diff --git a/README.md b/README.md
index d33db3066de534e8fb3aead86ab9ab5ea63b252c..5ad55a16581bc03d39b13b847277f0d29dda59a5 100644
--- a/README.md
+++ b/README.md
@@ -396,15 +396,15 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- Wide_ResNet50 |
+ Wide ResNet50 |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
Supported |
- - |
+ Supported |
@@ -746,7 +746,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
Conformer |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
diff --git a/models/cv/classification/swin_transformer_large/ixrt/README.md b/models/cv/classification/swin_transformer_large/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f2282f9fcfdfc756cebc98df53c11c0d03c339c
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/README.md
@@ -0,0 +1,96 @@
+# Swin Transformer Large
+
+## Description
+
+Swin Transformer-Large is a variant of the Swin Transformer, an architecture designed for computer vision tasks, particularly within the realms of image classification, object detection, and segmentation. The Swin Transformer-Large model represents an expanded version with more layers and parameters compared to its base configuration, aiming for improved performance and deeper processing of visual data.
+
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/cv/classification/swin_transformer_large/ixrt
+cd ${MODEL_PATH}
+
+apt install -y libnuma-dev libgl1-mesa-glx
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the open_imagenet dataset.
+
+Alternatively, you can run the following script to fetch both the pretrained model and the dataset:
+
+```bash
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+Please adjust the paths in the following commands and files to match your environment.
+
+```bash
+tar -xvf open-swin-large.tar
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/swin-large-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/swin-large/swin-transformer-large.pt --output_path swin-large-torch-fp32.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
+export OPTIMIZER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash ./scripts/infer_swinl_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document: . Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# copy data
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cp -r datasets/open_imagenet/* ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/
+mkdir -p ./ByteMLPerf/general_perf/model_zoo/popular/swin-large
+cp general_perf/model_zoo/popular/swin-large/* ./ByteMLPerf/general_perf/model_zoo/popular/swin-large
+
+# run acc scripts
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Top-1 Acc |
+| ---------------------- | --------- | --------- | ----- | --------- |
+| Swin Transformer Large | 2 | FP16 | 5.746 | 85.62 |
diff --git a/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py b/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+        option = yes_no_dialog(title=self.backend_type + ' Compile Configuration',
+                               text='[Please select]: Configure the compile backend?',
+                               style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh b/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eb9350ff48d8a05d6d7f5e2bf3ce1eb0930033b2
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=16
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+    python3 ${OPTIMIZER_FILE} --onnx ${ORIGIN_ONNX} --model_type swint --dump_onnx
+    OPTIMIZE_STATUS=$?
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin\
+ --min_shape pixel_values.1:${BS}x3x384x384 --opt_shape pixel_values.1:${BS}x3x384x384 --max_shape pixel_values.1:${BS}x3x384x384
+
+ # Test Performance
+    ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes pixel_values.1:${BS}x3x384x384
+ PERFORMANCE_STATUS=$?
+
+}
+run 1
\ No newline at end of file
diff --git a/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh b/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8154d350c9e24de33313a12873582641f8c73263
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_imagenet/
+
+wget -O general_perf/download/open-swin-large.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open-swin-large.tar
+tar xf general_perf/download/open-swin-large.tar -C general_perf/model_zoo/popular/
+
+
+# Download Datasets
+wget -O general_perf/download/open_imagenet.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_imagenet.tar
+tar xf general_perf/download/open_imagenet.tar -C datasets/
+
+
+echo "Extract Done."
diff --git a/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py b/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f115b730caf065b3f3dfc496c161916afc96d9e
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split("/")[-1][:-4]
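+    # output_path is expected to end with ".onnx"; slicing off the last 4 characters keeps the
+    # trailing dot, so model_name + "json" resolves to "<name>.json" in the current directory.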
+ with open(model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/igie/README.md b/models/cv/classification/wide_resnet50/igie/README.md
index 50120e88b90dca143ed5b9ce856af7d3903f5aa8..c3bebf1738ae76036f696eb7f1f0e6a80f93553b 100644
--- a/models/cv/classification/wide_resnet50/igie/README.md
+++ b/models/cv/classification/wide_resnet50/igie/README.md
@@ -1,4 +1,4 @@
-# WideResNet50
+# Wide ResNet50
## Description
@@ -51,7 +51,7 @@ bash scripts/infer_wide_resnet50_int8_performance.sh
## Results
-Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%)
--------------|-----------|----------|----------|----------|--------
-WideResNet50 | 32 | FP16 | 2312.383 | 78.459 | 94.052
-WideResNet50 | 32 | INT8 | 5195.654 | 77.957 | 93.798
+| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) |
+| ------------- | --------- | --------- | -------- | -------- | -------- |
+| Wide ResNet50 | 32 | FP16 | 2312.383 | 78.459 | 94.052 |
+| Wide ResNet50 | 32 | INT8 | 5195.654 | 77.957 | 93.798 |
diff --git a/models/cv/classification/wide_resnet50/ixrt/README.md b/models/cv/classification/wide_resnet50/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..72dd1308b11b2dd7f6237e8c7ec782c99107e0c2
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/README.md
@@ -0,0 +1,61 @@
+# Wide ResNet50
+
+## Description
+
+The distinguishing feature of Wide ResNet50 lies in its widened architecture compared to traditional ResNet models. By increasing the width of the residual blocks, Wide ResNet50 enhances the capacity of the network to capture richer and more diverse feature representations, leading to improved performance on various visual recognition tasks.
+
+## Setup
+
+### Install
+
+```bash
+pip3 install onnx
+pip3 install tqdm
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the ImageNet validation dataset.
+
+### Model Conversion
+
+```bash
+mkdir -p checkpoints/
+python3 export.py --weight wide_resnet50_2-95faca4d.pth --output checkpoints/wide_resnet50.onnx
+```
+
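+The scripts below drive the engine build and inference end to end. If you prefer to build an engine by hand from the exported ONNX, here is a minimal sketch using `build_engine.py` from this directory (the output file name is illustrative):
+
+```bash
+# Build an FP16 engine directly from the exported ONNX (paths are examples)
+python3 build_engine.py \
+    --model checkpoints/wide_resnet50.onnx \
+    --precision float16 \
+    --engine checkpoints/wide_resnet50_fp16.engine
+```
+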
+## Inference
+
+```bash
+export DATASETS_DIR=/Path/to/imagenet_val/
+export CHECKPOINTS_DIR=./checkpoints
+export RUN_DIR=./
+export CONFIG_DIR=config/WIDE_RESNET50_CONFIG
+```
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_wide_resnet50_fp16_accuracy.sh
+# Performance
+bash scripts/infer_wide_resnet50_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_wide_resnet50_int8_accuracy.sh
+# Performance
+bash scripts/infer_wide_resnet50_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) |
+| ------------- | --------- | --------- | -------- | -------- | -------- |
+| Wide ResNet50 | 32 | FP16 | 2478.551 | 78.486 | 94.084 |
+| Wide ResNet50 | 32 | INT8 | 5981.702 | 76.956 | 93.920 |
diff --git a/models/cv/classification/wide_resnet50/ixrt/build_engine.py b/models/cv/classification/wide_resnet50/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e126bc715aa77d38c3abdd1e02191a262689e7
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/build_engine.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from calibration_dataset import getdataloader
+import cuda.cudart as cudart
+
+def assertSuccess(err):
+ assert(err == cudart.cudaError_t.cudaSuccess)
+
+class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2):
+
+ def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=224):
+ super().__init__()
+ self.cache_file = cache_file
+ self.image_batcher = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz)
+ self.batch_generator = iter(self.image_batcher)
+        # Allocate a device buffer large enough for one float32 calibration batch (NCHW).
+        size = img_sz * img_sz * 3 * bsz * np.dtype(np.float32).itemsize
+        err, self.batch_allocation = cudart.cudaMalloc(size)
+ assertSuccess(err)
+
+ def __del__(self):
+ err,= cudart.cudaFree(self.batch_allocation)
+ assertSuccess(err)
+
+ def get_batch_size(self):
+ return self.image_batcher.batch_size
+
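+    # TensorRT calls get_batch repeatedly during INT8 calibration; it must return a list of
+    # device pointers (one per input), and returning None signals that the calibration data is exhausted.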
+ def get_batch(self, names):
+ try:
+ batch, _ = next(self.batch_generator)
+ batch = batch.numpy()
+ cudart.cudaMemcpy(self.batch_allocation,
+ np.ascontiguousarray(batch),
+ batch.nbytes,
+ cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+ return [int(self.batch_allocation)]
+ except StopIteration:
+ return None
+
+ def read_calibration_cache(self):
+ if os.path.exists(self.cache_file):
+ with open(self.cache_file, "rb") as f:
+ return f.read()
+
+ def write_calibration_cache(self, cache):
+ with open(self.cache_file, "wb") as f:
+ f.write(cache)
+
+def main(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ print("precision : ", precision)
+ build_config.set_flag(precision)
+ if config.precision == "int8":
+ build_config.int8_calibrator = EngineCalibrator("int8_cache", config.datasets_dir)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--engine", type=str, default=None)
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+ help="ImageNet dir",
+ )
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
diff --git a/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py b/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6038b33f50cff7a14efcefa6673ae9d2fd19870b
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import json
+import os
+
+import tensorrt
+import tensorrt as trt
+
+TRT_LOGGER = trt.Logger(tensorrt.Logger.VERBOSE)
+
+EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+
+def GiB(val):
+ return val * 1 << 30
+
+
+def json_load(filename):
+ with open(filename) as json_file:
+ data = json.load(json_file)
+ return data
+
+
+def setDynamicRange(network, json_file):
+ """Sets ranges for network layers."""
+ quant_param_json = json_load(json_file)
+ act_quant = quant_param_json["act_quant_info"]
+
+ for i in range(network.num_inputs):
+ input_tensor = network.get_input(i)
+ if act_quant.__contains__(input_tensor.name):
+ print(input_tensor.name)
+ value = act_quant[input_tensor.name]
+ tensor_max = abs(value)
+ tensor_min = -abs(value)
+ input_tensor.dynamic_range = (tensor_min, tensor_max)
+
+ for i in range(network.num_layers):
+ layer = network.get_layer(i)
+
+ for output_index in range(layer.num_outputs):
+ tensor = layer.get_output(output_index)
+
+ if act_quant.__contains__(tensor.name):
+ value = act_quant[tensor.name]
+ tensor_max = abs(value)
+ tensor_min = -abs(value)
+ tensor.dynamic_range = (tensor_min, tensor_max)
+ else:
+ print("\033[1;32m%s\033[0m" % tensor.name)
+
+
+def build_engine(onnx_file, json_file, engine_file):
+ builder = trt.Builder(TRT_LOGGER)
+ network = builder.create_network(EXPLICIT_BATCH)
+
+ config = builder.create_builder_config()
+
+ # If it is a dynamic onnx model , you need to add the following.
+ # profile = builder.create_optimization_profile()
+ # profile.set_shape("input_name", (batch, channels, min_h, min_w), (batch, channels, opt_h, opt_w), (batch, channels, max_h, max_w))
+ # config.add_optimization_profile(profile)
+
+ parser = trt.OnnxParser(network, TRT_LOGGER)
+ # config.max_workspace_size = GiB(1)
+ if not os.path.exists(onnx_file):
+ quit("ONNX file {} not found".format(onnx_file))
+
+ with open(onnx_file, "rb") as model:
+ if not parser.parse(model.read()):
+ print("ERROR: Failed to parse the ONNX file.")
+ for error in range(parser.num_errors):
+ print(parser.get_error(error))
+ return None
+
+ config.set_flag(trt.BuilderFlag.INT8)
+
+ setDynamicRange(network, json_file)
+
+ engine = builder.build_engine(network, config)
+
+ with open(engine_file, "wb") as f:
+ f.write(engine.serialize())
+
+
+if __name__ == "__main__":
+ # Add plugins if needed
+ # import ctypes
+ # ctypes.CDLL("libmmdeploy_tensorrt_ops.so")
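+    # Example invocation (file names are illustrative; quant.py writes quantized_<model_name>.onnx and quant_cfg.json):
+    #   python3 build_i8_engine.py --onnx quantized_wide_resnet50.onnx --qparam_json quant_cfg.json --engine wide_resnet50_int8.engine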
+ parser = argparse.ArgumentParser(
+ description="Writing qparams to onnx to convert tensorrt engine."
+ )
+ parser.add_argument("--onnx", type=str, default=None)
+ parser.add_argument("--qparam_json", type=str, default=None)
+ parser.add_argument("--engine", type=str, default=None)
+ arg = parser.parse_args()
+
+ build_engine(arg.onnx, arg.qparam_json, arg.engine)
+ print("\033[1;32mgenerate %s\033[0m" % arg.engine)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py b/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec931c656abf5b2309dc9938490df46e4e8cdb19
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from torchvision import models
+from torchvision import transforms as T
+
+
+class CalibrationImageNet(torchvision.datasets.ImageFolder):
+ def __init__(self, *args, **kwargs):
+ super(CalibrationImageNet, self).__init__(*args, **kwargs)
+ img2label_path = os.path.join(self.root, "val_map.txt")
+ if not os.path.exists(img2label_path):
+ raise FileNotFoundError(f"Not found label file `{img2label_path}`.")
+
+ self.img2label_map = self.make_img2label_map(img2label_path)
+
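+    # val_map.txt is expected to hold one "<image file name>\t<integer label>" entry per line.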
+ def make_img2label_map(self, path):
+ with open(path) as f:
+ lines = f.readlines()
+
+ img2lable_map = dict()
+ for line in lines:
+ line = line.lstrip().rstrip().split("\t")
+ if len(line) != 2:
+ continue
+ img_name, label = line
+ img_name = img_name.strip()
+ if img_name in [None, ""]:
+ continue
+ label = int(label.strip())
+ img2lable_map[img_name] = label
+ return img2lable_map
+
+ def __getitem__(self, index):
+ path, target = self.samples[index]
+ sample = self.loader(path)
+ if self.transform is not None:
+ sample = self.transform(sample)
+ # if self.target_transform is not None:
+ # target = self.target_transform(target)
+ img_name = os.path.basename(path)
+ target = self.img2label_map[img_name]
+
+ return sample, target
+
+
+def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0):
+ dataset = CalibrationImageNet(
+ data_path,
+ transform=T.Compose(
+ [
+ T.Resize(256),
+ T.CenterCrop(img_sz),
+ T.ToTensor(),
+ T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+ ]
+ ),
+ )
+
+ calibration_dataset = dataset
+ if num_samples is not None:
+ calibration_dataset = torch.utils.data.Subset(
+ dataset, indices=range(num_samples)
+ )
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=False,
+ num_workers=workers,
+ )
+
+ verify_dataloader = DataLoader(
+ dataset,
+ shuffle=False,
+ batch_size=batch_size,
+ drop_last=False,
+ num_workers=workers,
+ )
+
+ return calibration_dataloader, verify_dataloader
+
+
+def getdataloader(dataset_dir, step=20, batch_size=32, workers=2, img_sz=224, total_sample=50000):
+ num_samples = min(total_sample, step * batch_size)
+ if step < 0:
+ num_samples = None
+ calibration_dataloader, _ = create_dataloaders(
+ dataset_dir,
+ img_sz=img_sz,
+ batch_size=batch_size,
+ workers=workers,
+ num_samples=num_samples,
+ )
+ return calibration_dataloader
diff --git a/models/cv/classification/wide_resnet50/ixrt/common.py b/models/cv/classification/wide_resnet50/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..0458195e5b7980ce70585d7284ca8a875afa3fd6
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/common.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import torch
+import tensorrt
+import numpy as np
+import pycuda.driver as cuda
+
+def eval_batch(batch_score, batch_label):
+ batch_score = torch.tensor(torch.from_numpy(batch_score), dtype=torch.float32)
+ values, indices = batch_score.topk(5)
+ top1, top5 = 0, 0
+ for idx, label in enumerate(batch_label):
+
+ if label == indices[idx][0]:
+ top1 += 1
+ if label in indices[idx]:
+ top5 += 1
+ return top1, top5
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
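+    # Walk every engine binding, record its metadata, and allocate a device buffer large enough for one batch of it.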
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
diff --git a/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG b/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG
new file mode 100644
index 0000000000000000000000000000000000000000..04e6b34078b14979940a6f5b0747b8032ab6fc2a
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+# IMGSIZE : height/width of the model input
+# MODEL_NAME : basename used for the generated onnx/engine files
+# ORIGINE_MODEL : file name of the original onnx model
+IMGSIZE=224
+MODEL_NAME=Wide_Resnet50
+ORIGINE_MODEL=wide_resnet50.onnx
+
+# QUANT CONFIG (takes effect only when PRECISION is int8)
+ # QUANT_OBSERVER : quantization observer, one of [hist_percentile, percentile, minmax, entropy, ema]
+ # QUANT_BATCHSIZE : batch size of the calibration dataloader; it is best to keep it the same as the batch size in the onnx model, otherwise some ops (e.g. Reshape) may infer shapes incorrectly
+ # QUANT_STEP : number of calibration steps
+ # QUANT_SEED : random seed, to make the quantization result reproducible
+ # QUANT_EXIST_ONNX : fill in if you already have a quantized model from another source
+QUANT_OBSERVER=minmax
+QUANT_BATCHSIZE=1
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=
+QUANT_EXIST_ONNX=
diff --git a/models/cv/classification/wide_resnet50/ixrt/export.py b/models/cv/classification/wide_resnet50/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d3c64c825ab3aaf172f0c6ca7ef9b802ea06bf9
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/export.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import torch
+import torchvision
+import argparse
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="pytorch model weight.")
+
+ parser.add_argument("--output",
+ type=str,
+ required=True,
+ help="export onnx model path.")
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+
+ model = torchvision.models.wide_resnet50_2()
+ model.load_state_dict(torch.load(args.weight))
+ model.eval()
+
+ input_names = ['input']
+ output_names = ['output']
+ dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}}
+ dummy_input = torch.randn(1, 3, 224, 224)
+
+ torch.onnx.export(
+ model,
+ dummy_input,
+ args.output,
+ input_names = input_names,
+ dynamic_axes = dynamic_axes,
+ output_names = output_names,
+ opset_version=13
+ )
+
+ print("Export onnx model successfully! ")
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/classification/wide_resnet50/ixrt/inference.py b/models/cv/classification/wide_resnet50/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dcb3f9cc5b9a26903651a31fafa16d8f0db31
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/inference.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import json
+import os
+import re
+import time
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import tensorrt
+
+from calibration_dataset import getdataloader
+from common import eval_batch, create_engine_context, get_io_bindings
+
+def main(config):
+ dataloader = getdataloader(config.datasets_dir, config.loop_count, config.bsz, img_sz=config.imgsz)
+
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+ # Load Engine && I/O bindings
+ engine, context = create_engine_context(config.engine_file, logger)
+ inputs, outputs, allocations = get_io_bindings(engine)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ # Inference
+ if config.test_mode == "FPS":
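+        # Throughput test: execute the engine loop_count times on the pre-bound buffers and derive FPS from wall-clock time.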
+ torch.cuda.synchronize()
+ start_time = time.time()
+
+ for i in range(config.loop_count):
+ context.execute_v2(allocations)
+
+ torch.cuda.synchronize()
+ end_time = time.time()
+ forward_time = end_time - start_time
+
+ num_samples = 50000
+ if config.loop_count * config.bsz < num_samples:
+ num_samples = config.loop_count * config.bsz
+ fps = num_samples / forward_time
+
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ elif config.test_mode == "ACC":
+
+ ## Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ print(f"output shape : {output.shape} output type : {output.dtype}")
+
+ total_sample = 0
+ acc_top1, acc_top5 = 0, 0
+
+ with tqdm(total= len(dataloader)) as _tqdm:
+ for idx, (batch_data, batch_label) in enumerate(dataloader):
+ batch_data = batch_data.numpy().astype(inputs[0]["dtype"])
+ batch_data = np.ascontiguousarray(batch_data)
+ total_sample += batch_data.shape[0]
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ # squeeze output shape [32,1000,1,1] to [32,1000] for mobilenet_v2 model
+ if len(output.shape) == 4:
+ output = output.squeeze(axis=(2,3))
+
+ batch_top1, batch_top5 = eval_batch(output, batch_label)
+ acc_top1 += batch_top1
+ acc_top5 += batch_top5
+
+ _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample),
+ acc_5='{:.4f}'.format(acc_top5/total_sample))
+ _tqdm.update(1)
+
+ print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}")
+ print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}")
+ acc1 = acc_top1/total_sample
+ print(f"Accuracy Check : Test {acc1} >= target {config.acc_target}")
+ if acc1 >= config.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
+ parser.add_argument(
+ "--engine_file",
+ type=str,
+ help="engine file path"
+ )
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+ help="ImageNet dir",
+ )
+ parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times")
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=224,
+ help="inference size h,w",
+ )
+ parser.add_argument("--use_async", action="store_true")
+ parser.add_argument(
+ "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4"
+ )
+ parser.add_argument("--fps_target", type=float, default=-1.0)
+ parser.add_argument("--acc_target", type=float, default=-1.0)
+ parser.add_argument("--loop_count", type=int, default=-1)
+
+ config = parser.parse_args()
+ return config
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
diff --git a/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py b/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py
new file mode 100644
index 0000000000000000000000000000000000000000..689b7a972dcbfec77c185592ede16bb4f04fa4fd
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+
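+# Example usage (file names are illustrative):
+#   python3 modify_batchsize.py --batch_size 32 --origin_model wide_resnet50.onnx --output_model wide_resnet50_bs32.onnx
+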
+def change_input_dim(model, bsz):
+ batch_size = bsz
+
+ # The following code changes the first dimension of every input to be batch_size
+ # Modify as appropriate ... note that this requires all inputs to
+ # have the same batch_size
+ inputs = model.graph.input
+ for input in inputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with a first (batch) dim.
+ # Add checks as needed.
+ dim1 = input.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ if isinstance(batch_size, str):
+ # set dynamic batch size
+ dim1.dim_param = batch_size
+ elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
+ # set given batch size
+ dim1.dim_value = int(batch_size)
+ else:
+ # set batch size of 1
+ dim1.dim_value = 1
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--batch_size", type=int)
+ parser.add_argument("--origin_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_input_dim(model, args.batch_size)
+onnx.save(model, args.output_model)
+
+
+
+
+
diff --git a/models/cv/classification/wide_resnet50/ixrt/quant.py b/models/cv/classification/wide_resnet50/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d71c828629bb0370aa40c5bcdcf117812bbaedc
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/quant.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""这是一个高度自动化的 PPQ 量化的入口脚本,将你的模型和数据按要求进行打包:
+
+在自动化 API 中,我们使用 QuantizationSetting 对象传递量化参数。
+
+This file will show you how to quantize your network with PPQ
+ You should prepare your model and calibration dataset as follow:
+
+ ~/working/model.onnx <-- your model
+ ~/working/data/*.npy or ~/working/data/*.bin <-- your dataset
+
+if you are using caffe model:
+    ~/working/model.caffemodel <-- your model
+    ~/working/model.prototxt <-- your model
+
+### MAKE SURE YOUR INPUT LAYOUT IS [N, C, H, W] or [C, H, W] ###
+
+quantized model will be generated at: ~/working/quantized.onnx
+"""
+from ppq import *
+from ppq.api import *
+import os
+from calibration_dataset import getdataloader
+import argparse
+import random
+import numpy as np
+import torch
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--dataset_dir", type=str, default="imagenet_val")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"],
+ default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_dir", type=str, help="save path", default=None)
+ parser.add_argument("--bsz", type=int, default=32)
+ parser.add_argument("--step", type=int, default=20)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=224)
+ args = parser.parse_args()
+ print("Quant config:", args)
+ print(args.disable_quant_names)
+ return args
+
+
+config = parse_args()
+
+# modify configuration below:
+WORKING_DIRECTORY = 'checkpoints' # choose your working directory
+TARGET_PLATFORM = TargetPlatform.TRT_INT8 # choose your target platform
+MODEL_TYPE = NetworkFramework.ONNX # or NetworkFramework.CAFFE
+INPUT_LAYOUT = 'chw' # input data layout, chw or hwc
+NETWORK_INPUTSHAPE = [1, 3, 224, 224] # input shape of your network
+EXECUTING_DEVICE = 'cuda' # 'cuda' or 'cpu'.
+REQUIRE_ANALYSE = False
+TRAINING_YOUR_NETWORK = False             # whether to finetune your network
+# -------------------------------------------------------------------
+# Load your model file; PPQ will parse the onnx or caffe model into its own format.
+# If you are using a framework such as pytorch or tensorflow, export the model to onnx first
+# with torch.onnx.export. If you hit an error while exporting the torch model, feel free to contact us.
+# -------------------------------------------------------------------
+graph = None
+if MODEL_TYPE == NetworkFramework.ONNX:
+ graph = load_onnx_graph(onnx_import_file=config.model)
+if MODEL_TYPE == NetworkFramework.CAFFE:
+ graph = load_caffe_graph(
+ caffemodel_path=os.path.join(WORKING_DIRECTORY, 'model.caffemodel'),
+ prototxt_path=os.path.join(WORKING_DIRECTORY, 'model.prototxt'))
+assert graph is not None, 'Graph Loading Error, Check your input again.'
+
+# -------------------------------------------------------------------
+# The SETTING object controls PPQ's quantization logic; it mainly describes graph fusion, scheduling, and fine-grained quantization strategies.
+# When the quantization error of your network is too high, modify the attributes of the SETTING object to apply specific optimizations.
+# -------------------------------------------------------------------
+QS = QuantizationSettingFactory.default_setting()
+
+# -------------------------------------------------------------------
+# The following shows how to use a finetuning pass to improve quantization accuracy.
+# PPQ provides more than ten algorithms to help you recover accuracy.
+# They are all enabled via QS.xxxx = True.
+# Use them as needed; do not turn them all on at once, or things can easily blow up.
+# -------------------------------------------------------------------
+if TRAINING_YOUR_NETWORK:
+    QS.lsq_optimization = True                             # retrain the network to reduce quantization error
+    QS.lsq_optimization_setting.steps = 500                # number of retraining steps; affects training time, 500 steps take a few minutes
+    QS.lsq_optimization_setting.collecting_device = 'cuda' # where cached data is kept; 'cuda' means on the GPU, switch to 'cpu' if you run out of GPU memory
+
+
+dataloader = getdataloader(config.dataset_dir, config.step, batch_size=config.bsz, img_sz=config.imgsz)
+# ENABLE CUDA KERNEL speeds up quantization by roughly 3x ~ 10x, but it cannot be compiled without the corresponding build environment.
+# You can try installing the build environment, or quantize without the CUDA kernel by simply removing "with ENABLE_CUDA_KERNEL():".
+with ENABLE_CUDA_KERNEL():
+    print('Quantizing the network; depending on your quantization configuration, this may take a while:')
+ quantized = quantize_native_model(
+        setting=QS,                                 # the setting object controls the standard quantization logic
+ model=graph,
+ calib_dataloader=dataloader,
+ calib_steps=config.step,
+        input_shape=NETWORK_INPUTSHAPE,             # if your network has a single input, pass its shape here
+        inputs=None,
+        # if your network has multiple inputs, use this parameter instead, i.e. input_shape=None, inputs=[torch.zeros(1,3,224,224), torch.zeros(1,3,224,224)]
+        collate_fn=lambda x: x[0].to(EXECUTING_DEVICE),  # collate_fn works like the collate fn of a torch dataloader and is used for data preprocessing;
+        # you can of course use the torch dataloader's own collate fn and set this to None
+ platform=TARGET_PLATFORM,
+ device=EXECUTING_DEVICE,
+ do_quantize=True)
+
+ # -------------------------------------------------------------------
+    # If you need to run the quantized network and obtain its outputs, create an executor.
+    # The executor behaves much like a torch.Module; you can use it to get execution results.
+    # Note that this must be done before exporting.
+ # -------------------------------------------------------------------
+ executor = TorchExecutor(graph=quantized, device=EXECUTING_DEVICE)
+ # output = executor.forward(input)
+
+ # -------------------------------------------------------------------
+    # When PPQ computes quantization error, it uses the reciprocal of the signal-to-noise ratio, i.e. noise energy / signal energy.
+    # A quantization error of 0.1 means the quantization noise accounts for roughly 10% of the overall signal energy.
+    # Note that graphwise_error_analyse measures the accumulated error;
+    # the last layer of a network usually has a large accumulated error, caused jointly by all the layers before it.
+    # Use layerwise_error_analyse to trace the error back to individual layers.
+ # -------------------------------------------------------------------
+    print('Computing the graph-wise quantization error (SNR); the error of the last layer should stay below 0.1 to preserve accuracy:')
+ reports = graphwise_error_analyse(
+ graph=quantized, running_device=EXECUTING_DEVICE, steps=32,
+ dataloader=dataloader, collate_fn=lambda x: x[0].to(EXECUTING_DEVICE))
+ for op, snr in reports.items():
+        if snr > 0.1: ppq_warning(f'Layer {op} has a significant accumulated quantization error; consider optimizing it.')
+
+ if REQUIRE_ANALYSE:
+        print('Computing the layer-wise quantization error (SNR); the independent error of each layer should stay below 0.1 to preserve accuracy:')
+ layerwise_error_analyse(graph=quantized, running_device=EXECUTING_DEVICE,
+ interested_outputs=None,
+ dataloader=dataloader, collate_fn=lambda x: x.to(EXECUTING_DEVICE))
+
+ # -------------------------------------------------------------------
+    # Use export_ppq_graph to export the quantized model.
+    # PPQ adjusts the model format according to the export platform you selected.
+ # -------------------------------------------------------------------
+    print('Quantization finished; generating the target files:')
+ export_ppq_graph(
+ graph=quantized, platform=TARGET_PLATFORM,
+ graph_save_to=os.path.join(config.save_dir, f"quantized_{config.model_name}.onnx"),
+ config_save_to=os.path.join(config.save_dir, 'quant_cfg.json'))
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_model.py b/models/cv/classification/wide_resnet50/ixrt/refine_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f1e6c2f6325651556267ceed7e4403a565a2f69
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_model.py
@@ -0,0 +1,290 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import argparse
+import dataclasses
+
+import torch
+import onnx
+
+from refine_utils.matmul_to_gemm_pass import FusedGemmPass
+from refine_utils.linear_pass import FusedLinearPass
+
+from refine_utils.common import *
+
+def get_constant_input_name_of_operator(graph: Graph, operator: Operator):
+ const = None
+ for input in operator.inputs:
+ if not graph.containe_var(input):
+ continue
+
+ if not graph.is_leaf_variable(input):
+ continue
+
+ input_var = graph.get_variable(input)
+ if input_var.value is not None:
+ const = input
+ return const
+
+class FuseLayerNormPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+ find_sequence_subgraph(
+ graph,
+ [OP.REDUCE_MEAN, OP.SUB, OP.POW, OP.REDUCE_MEAN, OP.ADD, OP.SQRT, OP.DIV, OP.MUL, OP.ADD],
+ self.fuse_layer_norm,
+ strict=False
+ )
+ return graph
+
+ def fuse_layer_norm(self, graph: Graph, pattern: PatternGraph):
+        # Check that REDUCE_MEAN and SUB take the same input
+ if pattern.nodes[0].operator.inputs[0] != pattern.nodes[1].operator.inputs[0]:
+ return
+
+        # Check that POW and DIV take the same input
+ if pattern.nodes[2].operator.inputs[0] != pattern.nodes[6].operator.inputs[0]:
+ return
+
+        # Check whether the outputs of the intermediate operators are consumed by more than one operator
+ nodes = pattern.nodes
+ for node in [nodes[0]] + nodes[2:-1]:
+ next_ops = graph.get_next_operators(node.operator)
+ if len(next_ops) > 1:
+ return
+
+ eps = None
+ for input in nodes[4].operator.inputs:
+ input_var = graph.get_variable(input)
+ if input_var.value is not None and graph.is_leaf_variable(input):
+ eps = to_py_type(input_var.value)
+
+ scale = get_constant_input_name_of_operator(graph, nodes[-2].operator)
+ bias = get_constant_input_name_of_operator(graph, nodes[-1].operator)
+
+ self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator)
+
+ bias_var = graph.get_variable(bias)
+ print(bias_var)
+
+ attributes = {
+ "axis": nodes[0].operator.attributes.axes,
+ "epsilon": eps,
+ }
+
+
+ layer_norm_op = self.transform.make_operator(
+ op_type="LayerNormalization",
+ inputs=[nodes[0].operator.inputs[0], scale, bias],
+ outputs=[nodes[-1].operator.outputs[0]],
+ **attributes
+ )
+
+ self.transform.add_operator(layer_norm_op)
+
+class FusedGeluPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.DIV, OP.ERF, OP.ADD, OP.MUL, OP.MUL], callback=self.fuse_gelu, strict=True
+ )
+ return graph
+
+ def fuse_gelu(self, graph: Graph, pattern: PatternGraph):
+ nodes = pattern.nodes
+ prev_op = self.transform.get_previous_operators(nodes[0].operator)[0]
+ next_ops = self.transform.get_next_operators(prev_op)
+ if len(next_ops) != 2:
+ return
+
+ if nodes[0].operator not in next_ops or nodes[3].operator not in next_ops:
+ return
+
+ gelu_op_input = None
+ for input in nodes[3].operator.inputs:
+ if input in nodes[0].operator.inputs:
+ gelu_op_input = input
+ break
+
+ self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator)
+
+ gelu_op = self.transform.make_operator(
+ op_type=OP.GELU,
+ inputs=[gelu_op_input],
+ outputs=[nodes[-1].operator.outputs[0]]
+ )
+ self.transform.add_operator(gelu_op)
+
+@dataclasses.dataclass
+class NormalizeAttr(BaseOperatorAttr):
+ p: float = 2.0
+ epsilon: float = 1e-12
+ axis: int = 1
+
+
+@registe_operator(OP.GELU)
+class GeluOperator(BaseOperator):
+
+ def call(
+ self,
+ executor,
+ operator: Operator,
+ inputs: List,
+ attr: NormalizeAttr,
+ ):
+ return F.gelu(inputs[0])
+
+ def convert_onnx_operator(
+ self, ir_graph: Graph, onnx_graph: onnx.GraphProto, node: onnx.NodeProto
+ ) -> Operator:
+ return default_converter(ir_graph, onnx_graph, node, attr_cls=attr.EmptyAttr)
+
+ def quantize(
+ self,
+ graph: Graph,
+ op: Operator,
+ operator_observer_config: QuantOperatorObserverConfig,
+ quant_outputs: bool = False,
+ ):
+ return quant_single_input_operator(graph, op, operator_observer_config, quant_outputs=quant_outputs)
+
+
+
+class ClearUnsedVariables(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ vars = list(graph.variables)
+
+ for var in vars:
+ if len(graph.get_dst_operators(var)) == 0 and graph.is_leaf_variable(var):
+ graph.delete_variable(var)
+
+ quant_params = list(graph.quant_parameters.keys())
+ for var in quant_params:
+ if not graph.containe_var(var):
+ graph.quant_parameters.pop(var)
+
+ return graph
+
+class FormatLayerNorm(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ for op in graph.operators.values():
+ if "LayerNorm" in op.op_type:
+ self.format_layer_norm(graph, op)
+ return graph
+
+ def format_layer_norm(self, graph, operator):
+ if not hasattr(operator.attributes, "axis"):
+ return
+ if isinstance(operator.attributes.axis, (tuple, list)):
+ operator.attributes.axis = operator.attributes.axis[0]
+
+class FormatReshape(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ for op in graph.operators.values():
+ if op.op_type == "Reshape":
+ self.format_reshape(graph, op)
+
+ return graph
+
+ def format_reshape(self, graph, operator):
+ shape = graph.get_variable(operator.inputs[1])
+ shape.value = torch.tensor(shape.value, dtype=torch.int64)
+
+class FormatScalar(BasePass):
+
+ def process(self, graph: Graph):
+ for var in graph.variables.values():
+ var: Variable
+ use_ops = graph.get_dst_operators(var)
+
+ if len(use_ops) == 0:
+ continue
+
+ if use_ops[0].op_type not in [OP.MUL, OP.ADD, OP.GATHER]:
+ continue
+
+ if var.value is not None and var.value.ndim == 0:
+ var.value = var.value.reshape(1)
+ print(f"Reshape scalar to tensor for {var.name}.")
+
+ return graph
+
+class RenamePass(BasePass):
+
+ def process(self, graph:Graph):
+
+ names = [name for name in graph.operators.keys()]
+ for old_name in names:
+ new_name = old_name.replace("/", "#")
+
+ graph.rename_operator(old_name, new_name)
+
+ names = [name for name in graph.variables.keys()]
+ for name in names:
+ new_name = name.replace("/", ".").replace("Output", "out").replace("output", "out")
+
+ graph.rename_vaiable(name, new_name,
+ with_variables=True,
+ with_operator_outputs=True)
+
+ return graph
+
+def create_pipeline(example_inputs):
+ return PassSequence(
+ # FuseLayerNormPass(),
+ FusedGeluPass(),
+
+ # ClearUnsedVariables(),
+ # FormatLayerNorm(),
+ # FormatReshape(),
+ # FormatScalar(),
+ # RenamePass()
+ )
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--onnx_path", type=str)
+ parser.add_argument("--dst_onnx_path", type=str)
+
+ parser.add_argument("--bsz", type=int, default=8,
+ help="Batch size")
+ parser.add_argument("--imgsz", type=int, default=224,
+ help="Image size")
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ example_inputs = torch.randn(args.bsz, 3, args.imgsz, args.imgsz)
+
+    refine_pipeline = Pipeline(
+ create_source(f"{args.onnx_path}", example_inputs=example_inputs),
+ create_pipeline(example_inputs),
+ create_target(
+ f"{args.dst_onnx_path}",
+ example_inputs=example_inputs,
+ )
+ )
+    refine_pipeline.run()
+
+    print(f"Refined the model, input shape={example_inputs.shape}")
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/__init__.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2af19a14df73cea6ba27ad6a8ad020fe0bec7aaa
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+
+def find_sequence_subgraph(graph,
+ pattern: Union[List[str], PatternGraph],
+ callback: Callable[[Graph, PatternGraph], None],
+ strict=True):
+ if isinstance(pattern, List):
+ pattern = build_sequence_graph(pattern)
+
+ matcher = GraphMatcher(pattern, strict=strict)
+ return matcher.findall(graph, callback)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b5e4a96e6edc448168bd78ede3111f6b59c032
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import dataclasses
+
+from refine_utils.common import *
+
+# A x B = C, only when B is an initializer
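+# This pass rewrites MatMul (optionally followed by Add) into a fused "LinearFP16" op:
+# the initializer B is transposed to [linear_dim, hidden_size], and when an Add follows,
+# its bias tensor is appended as the third input of the fused operator.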
+
+class FusedLinearPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL, OP.ADD], callback=self.to_linear_with_bias, strict=True
+ )
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL], callback=self.to_linear, strict=True
+ )
+ return graph
+
+ def to_linear_with_bias(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ add = pattern.nodes[1]
+ if len(add.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+ bias_var = None
+ for input in add.operator.inputs:
+ if input not in matmul.operator.outputs:
+ bias_var = input
+
+ inputs = matmul.operator.inputs
+ inputs.append(bias_var)
+ outputs = add.operator.outputs
+
+ b_var.value = b_var.value.transpose(1, 0)
+ b_var.shape[0],b_var.shape[1] = b_var.shape[1],b_var.shape[0]
+
+ hidden_size = b_var.shape[1]
+ linear_dim = b_var.shape[0]
+
+ attributes = {
+ "hidden_size": hidden_size,
+ "linear_dim": linear_dim,
+ "has_bias": 1,
+ "act_type":"none"
+ }
+
+ self.transform.make_operator(
+ "LinearFP16",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+ self.transform.delete_operator(add.operator)
+ self.transform.delete_operator(matmul.operator)
+
+ def to_linear(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ if len(matmul.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+        b_var.value = b_var.value.transpose(1, 0)
+        b_var.shape[0], b_var.shape[1] = b_var.shape[1], b_var.shape[0]
+
+        hidden_size = b_var.shape[1]
+        linear_dim = b_var.shape[0]
+
+        attributes = {
+            "hidden_size": hidden_size,
+            "linear_dim": linear_dim,
+            "has_bias": 0,
+            "act_type": "none"
+        }
+
+ op = self.transform.make_operator(
+ op_type = "LinearFP16",
+ inputs = pattern.nodes[0].operator.inputs,
+ outputs=[pattern.nodes[-1].operator.outputs[0]],
+ **attributes
+ )
+
+ self.transform.add_operator(op)
+
+ self.transform.delete_operator(matmul.operator)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ebfac4d917d6b05e46187f025c3c17184096e80
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from refine_utils.common import *
+
+#
+# Common pattern Matmul to Gemm
+#
+class FusedGemmPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL], callback=self.to_gemm, strict=True
+ )
+ return graph
+
+ def to_gemm(self, graph, pattern: PatternGraph):
+ matmul_op = pattern.nodes[0]
+ inputs = matmul_op.operator.inputs
+ outputs = matmul_op.operator.outputs
+
+        if len(inputs) != 2 or len(outputs) != 1:
+ return
+
+ for input in inputs:
+ if self.transform.is_leaf_variable(input):
+ return
+
+ print(f"{self.transform.get_variable(inputs[0]).shape} {self.transform.get_variable(inputs[1]).shape}")
+ self.transform.delete_operator(matmul_op.operator)
+
+ op = self.transform.make_operator(
+ op_type = "Gemm",
+ inputs = inputs,
+ outputs = outputs,
+ alpha = 1,
+ beta = 1,
+ transB = 1
+ )
+
+ self.transform.add_operator(op)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b743d7084ae058118c29daaf494769fc293ceb41
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
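+# Example (illustrative): the caller is expected to export CONFIG_DIR, CHECKPOINTS_DIR,
+# DATASETS_DIR and RUN_DIR, and may override the defaults above, e.g. "--bs 32 --tgt -1".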
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+fi
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e7a4f1a7276406a0ed7400af4368b5bec2a06e06
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+fi
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..367bdd4bd22be28f96cd3c6719888d0ca889c612
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -x
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=int8
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+ echo [STEP ${step}] : Simplify Model
+ if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+ else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+ fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --model_name ${MODEL_NAME} \
+ --dataset_dir ${DATASETS_DIR} \
+ --observer ${QUANT_OBSERVER} \
+ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+ --save_dir $CHECKPOINTS_DIR \
+ --bsz ${QUANT_BATCHSIZE} \
+ --step ${QUANT_STEP} \
+ --seed ${QUANT_SEED} \
+ --imgsz ${IMGSIZE}
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+ # Change Batchsize
+ let step++
+ echo;
+ echo [STEP ${step}] : Change Batchsize
+ FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx
+ if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+ else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+ fi
+
+ # Build Engine
+ let step++
+ echo;
+ echo [STEP ${step}] : Build Engine
+ ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+ else
+ python3 ${RUN_DIR}/build_i8_engine.py \
+ --onnx ${FINAL_MODEL} \
+ --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+ fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..468c557de451ddab0024ef2c69e9fa42751a50ce
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=int8
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+ echo [STEP ${step}] : Simplify Model
+ if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+ else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+ fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --model_name ${MODEL_NAME} \
+ --dataset_dir ${DATASETS_DIR} \
+ --observer ${QUANT_OBSERVER} \
+ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+ --save_dir $CHECKPOINTS_DIR \
+ --bsz ${QUANT_BATCHSIZE} \
+ --step ${QUANT_STEP} \
+ --seed ${QUANT_SEED} \
+ --imgsz ${IMGSIZE}
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+ # Change Batchsize
+ let step++
+ echo;
+ echo [STEP ${step}] : Change Batchsize
+ FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx
+ if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+ else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+ fi
+
+ # Build Engine
+ let step++
+ echo;
+ echo [STEP ${step}] : Build Engine
+ ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+ else
+ python3 ${RUN_DIR}/build_i8_engine.py \
+ --onnx ${FINAL_MODEL} \
+ --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+ fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/simplify_model.py b/models/cv/classification/wide_resnet50/ixrt/simplify_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9948a9fa083ff99ff88e556e96614b02cccaa965
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/simplify_model.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+from onnxsim import simplify
+
+# Simplify
+def simplify_model(args):
+ onnx_model = onnx.load(args.origin_model)
+ model_simp, check = simplify(onnx_model)
+ model_simp = onnx.shape_inference.infer_shapes(model_simp)
+ onnx.save(model_simp, args.output_model)
+ print(" Simplify onnx Done.")
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--origin_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ parser.add_argument("--reshape", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+simplify_model(args)
+
+
+
+
diff --git a/models/cv/detection/yolov4/ixrt/README.md b/models/cv/detection/yolov4/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..886a87aec3bc59e730e1b9fb3436fe07c8179600
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/README.md
@@ -0,0 +1,82 @@
+# YOLOv4
+
+## Description
+
+YOLOv4 employs a two-step process: regression for bounding-box localization and classification for object categorization. It combines contributions from earlier YOLO-family research with newer techniques such as WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, DropBlock regularization, and CIoU loss.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained cfg:
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+### Model Conversion
+
+```bash
+# clone yolov4
+git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git yolov4
+
+# download weight
+mkdir data
+wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P data
+
+# export onnx model
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight data/yolov4.weights --batchsize 16 --output data/yolov4.onnx
+mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
+
+# Use onnxsim to optimize the onnx model
+onnxsim data/yolov4.onnx data/yolov4_sim.onnx
+
+# Make sure the dataset path is "data/coco"
+```
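+
+As an optional sanity check (a minimal sketch assuming the paths produced by the commands above), you can verify that the simplified ONNX is valid and carries the expected static input shape before building the engine:
+
+```python
+import onnx
+
+model = onnx.load("data/yolov4_sim.onnx")
+onnx.checker.check_model(model)
+
+# Print each graph input and its shape; with the export settings above this should be
+# a single input of shape [16, 3, 608, 608].
+for inp in model.graph.input:
+    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
+    print(inp.name, dims)
+```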
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov4darknet_fp16_accuary.sh
+# Performance
+bash scripts/infer_yolov4darknet_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov4darknet_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov4darknet_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | ------ | ------- |
+| YOLOv4 | 32 | FP16 | 303.27 | 0.730 |
+| YOLOv4 | 32 | INT8 | 682.14 | 0.608 |
+
+## Reference
+
+DarkNet:
+Pytorch-YOLOv4:
diff --git a/models/cv/detection/yolov4/ixrt/build_engine.py b/models/cv/detection/yolov4/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4080edd3c275a4595cbfb407a21cebdada7fa7
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/build_engine.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
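+    # The optimization profile above covers batch sizes from 1 (min) to 64 (max), with
+    # 32 as the optimum, at a fixed 608x608 input resolution.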
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov4/ixrt/coco_labels.py b/models/cv/detection/yolov4/ixrt/coco_labels.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc21282c7fa393e9d15e8bdc16c741dc7e78448
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/coco_labels.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+labels = [
+ "person",
+ "bicycle",
+ "car",
+ "motorcycle",
+ "airplane",
+ "bus",
+ "train",
+ "truck",
+ "boat",
+ "traffic light",
+ "fire hydrant",
+ "stop sign",
+ "parking meter",
+ "bench",
+ "bird",
+ "cat",
+ "dog",
+ "horse",
+ "sheep",
+ "cow",
+ "elephant",
+ "bear",
+ "zebra",
+ "giraffe",
+ "backpack",
+ "umbrella",
+ "handbag",
+ "tie",
+ "suitcase",
+ "frisbee",
+ "skis",
+ "snowboard",
+ "sports ball",
+ "kite",
+ "baseball bat",
+ "baseball glove",
+ "skateboard",
+ "surfboard",
+ "tennis racket",
+ "bottle",
+ "wine glass",
+ "cup",
+ "fork",
+ "knife",
+ "spoon",
+ "bowl",
+ "banana",
+ "apple",
+ "sandwich",
+ "orange",
+ "broccoli",
+ "carrot",
+ "hot dog",
+ "pizza",
+ "donut",
+ "cake",
+ "chair",
+ "couch",
+ "potted plant",
+ "bed",
+ "dining table",
+ "toilet",
+ "tv",
+ "laptop",
+ "mouse",
+ "remote",
+ "keyboard",
+ "cell phone",
+ "microwave",
+ "oven",
+ "toaster",
+ "sink",
+ "refrigerator",
+ "book",
+ "clock",
+ "vase",
+ "scissors",
+ "teddy bear",
+ "hair drier",
+ "toothbrush",
+]
+def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper)
+ return [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
+
+__all__ = ["labels"]
diff --git a/models/cv/detection/yolov4/ixrt/common.py b/models/cv/detection/yolov4/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
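+# e.g. one row [10, 20, 4, 6, 0.9, 0.2, 0.7] (cx, cy, w, h, conf, prob0, prob1)
+#      maps to [8.0, 17.0, 12.0, 23.0, 2.0, 0.63] (x1, y1, x2, y2, class_id starting at 1, prob1*conf)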
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+ If image path is given, return it directly
+ For txt file, read it and return each line as image path
+ In other case, it's a folder, return a list with names of each
+ jpg, jpeg and png file
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
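+    # Note: images collected after the last flush stay in the temp_* lists and are never
+    # appended, so the trailing (possibly partial) batch is dropped.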
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+ # [batch, num, 1, 4]
+ box_array = output[:, :, :4]
+ # [batch, num, 2]
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov4/ixrt/cut_model.py b/models/cv/detection/yolov4/ixrt/cut_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf4f88dae926b8d15356c7f6b48d89fe80dc9f2a
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/cut_model.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+from onnxsim import simplify
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ parser.add_argument("--input_names", nargs='+', type=str)
+ parser.add_argument("--output_names", nargs='+', type=str)
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
+print(" Cut Model Done.")
diff --git a/models/cv/detection/yolov4/ixrt/deploy.py b/models/cv/detection/yolov4/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..084356ec8cb14a0604bf994faca4ce15834e4b15
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/deploy.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseMishPass(BasePass):
+ def process(self, graph: Graph) -> Graph:
+ pattern = build_sequence_graph([OP.SOFTPLUS, OP.TANH, OP.MUL])
+
+ matcher = GraphMatcher(pattern, strict=False)
+ self.transform = GraphTransform(graph)
+ matcher.findall(graph, self.fuse_mish)
+ return graph
+
+ def fuse_mish(self, graph: Graph, pattern_graph: PatternGraph):
+ softplus = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+ if not self.can_fused(graph, pattern_graph):
+ return
+
+ self.transform.delete_operators_between_op_op(softplus, mul)
+
+ mish_op = Operator(
+ name=generate_operator_name(graph, pattern="Mish_{idx}"),
+ op_type=OP.MISH,
+ inputs=copy.copy(softplus.inputs),
+ outputs=copy.copy(mul.outputs),
+ )
+ mish_op.is_quant_operator = softplus.is_quant_operator and mul.is_quant_operator
+ graph.add_operator(mish_op)
+
+ def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
+ softplus = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+        # Check that the outputs of Softplus and Tanh are each consumed by only one op;
+        # if they feed multiple ops, the pattern cannot be fused
+ for node in pattern_graph.nodes[:2]:
+ next_ops = graph.get_next_operators(node.operator)
+ if len(next_ops) != 1:
+ return False
+
+        # Check that Mul and Softplus take their input from the same source
+ softplus_prev_op = graph.get_previous_operators(softplus)
+ if len(softplus_prev_op) != 1:
+ return False
+
+ mul_prev_op = graph.get_previous_operators(mul)
+ if len(mul_prev_op) != 2:
+ return False
+
+ for op in mul_prev_op:
+ if op is softplus_prev_op[0]:
+ return True
+
+ return False
+
+
+class Transform:
+ def __init__(self, graph):
+ self.t = GraphTransform(graph)
+ self.graph = graph
+
+ def ReplaceFocus(self, input_edge, outputs, to_op):
+ input_var = self.graph.get_variable(input_edge)
+ op = self.graph.get_operator(to_op)
+ self.t.delete_operators_between_var_op(
+ from_var=input_var, to_op=op
+ )
+ self.t.make_operator(
+ "Focus", inputs=input_edge, outputs=outputs
+ )
+ return self.graph
+
+ def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes):
+ if attributes["anchor"] is None:
+ del attributes["anchor"]
+ self.t.make_operator(
+ op_type, inputs=inputs, outputs=outputs, **attributes
+ )
+ return self.graph
+
+ def AddConcatOp(self, inputs: list, outputs, **attributes):
+ self.t.make_operator(
+ "Concat", inputs=inputs, outputs=outputs, **attributes
+ )
+ return self.graph
+
+def customize_ops(graph, args):
+ t = Transform(graph)
+ fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None
+ if fuse_focus:
+ graph = t.ReplaceFocus(
+ input_edge=args.focus_input,
+ outputs=args.focus_output,
+ to_op=args.focus_last_node
+ )
+ decoder_input = args.decoder_input_names
+ num = len(decoder_input) // 3
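+    # Attach one YOLO decoder per detection head (strides 8/16/32, plus an optional
+    # stride-64 head), then concatenate all decoded boxes into a single "output" tensor.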
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[:num],
+ outputs=["decoder_8"],
+ op_type=args.decoder_type,
+ anchor=args.decoder8_anchor,
+ num_class=args.num_class,
+ stride=8,
+ faster_impl=args.faster
+ )
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num:num*2],
+ outputs=["decoder_16"],
+ op_type=args.decoder_type,
+ anchor=args.decoder16_anchor,
+ num_class=args.num_class,
+ stride=16,
+ faster_impl=args.faster
+ )
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num*2:num*2+1],
+ outputs=["decoder_32"],
+ op_type=args.decoder_type,
+ anchor=args.decoder32_anchor,
+ num_class=args.num_class,
+ stride=32,
+ faster_impl=args.faster
+ )
+ if args.decoder64_anchor is not None:
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num*2+1:],
+ outputs=["decoder_64"],
+ op_type=args.decoder_type,
+ anchor=args.decoder64_anchor,
+ num_class=args.num_class,
+ stride=64,
+ faster_impl=args.faster
+ )
+ graph = t.AddConcatOp(
+ inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"],
+ outputs=["output"],
+ axis=1
+ )
+ else:
+ graph = t.AddConcatOp(
+ inputs=["decoder_32", "decoder_16", "decoder_8"],
+ outputs=["output"],
+ axis=1
+ )
+
+ graph.outputs.clear()
+ graph.add_output("output")
+ graph.outputs["output"].dtype = "FLOAT"
+ return graph
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--src", type=str)
+ parser.add_argument("--dst", type=str)
+ parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+ parser.add_argument("--decoder_input_names", nargs='+', type=str)
+ parser.add_argument("--decoder8_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder16_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder32_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
+ parser.add_argument("--num_class", type=int, default=80)
+ parser.add_argument("--faster", type=int, default=1)
+ parser.add_argument("--focus_input", type=str, default=None)
+ parser.add_argument("--focus_output", type=str, default=None)
+ parser.add_argument("--focus_last_node", type=str, default=None)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+
+ args = parse_args()
+ graph = create_source(args.src)()
+ graph = customize_ops(graph, args)
+ graph = FuseMishPass().process(graph)
+ create_target(saved_path=args.dst).export(graph)
+    print("Modified ONNX saved to", args.dst)
diff --git a/models/cv/detection/yolov4/ixrt/export.py b/models/cv/detection/yolov4/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c8bbfa5aa79f1a982c340690658325d23fa4b54
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/export.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+sys.path.insert(0, "yolov4")
+import argparse
+
+from yolov4.tool.darknet2onnx import *
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--cfg",
+ type=str,
+ required=True,
+ help="darknet cfg path.")
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="darknet weights path.")
+
+ parser.add_argument("--batchsize",
+ type=int,
+ required=True,
+ help="Onnx model batchsize.")
+
+ parser.add_argument("--output",
+ type=str,
+ required=True,
+ help="export onnx model path.")
+
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ transform_to_onnx(args.cfg, args.weight, args.batchsize, args.output)
+
+if __name__ == "__main__":
+ main()
+
diff --git a/models/cv/detection/yolov4/ixrt/inference.py b/models/cv/detection/yolov4/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d740507b3a54bf2248000b2ac60d09f12a9886a
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/inference.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import glob
+import json
+import os
+import time
+import sys
+from tqdm import tqdm
+
+import torch
+import numpy as np
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from coco_labels import coco80_to_coco91_class
+from common import save2json, box_class85to6
+from common import load_images, prepare_batch
+from common import create_engine_context, setup_io_bindings
+from common import scale_boxes, post_processing
+
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+
+
+def main(config):
+
+ # Step1: Load dataloader
+ images_path = load_images(config.eval_dir)
+ dataloader = prepare_batch(images_path, config.bsz)
+
+ # Step2: Load Engine
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(config.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((config.bsz,3,config.imgsz,config.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ json_result = []
+ forward_time = 0.0
+ class_map = coco80_to_coco91_class()
+ num_samples = 0
+ # Step3: Run on coco dataset
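+    # In MAP mode the predictions are collected in COCO JSON format and scored with
+    # pycocotools; in FPS mode only the forward passes are timed.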
+ for batch_names, batch_images, batch_shapes in tqdm(zip(*dataloader)):
+ batch_data = np.ascontiguousarray(batch_images)
+ data_shape = batch_data.shape
+ h, w = zip(*batch_shapes)
+ batch_img_shape = [h, w]
+ batch_img_id = [int(x.split('.')[0]) for x in batch_names]
+
+ cur_bsz_sample = batch_images.shape[0]
+ num_samples += cur_bsz_sample
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ # print(f"output shape : {output.shape} output type : {output.dtype}")
+
+ # Forward
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ if config.test_mode == "MAP":
+ # Fetch output
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+ pred_boxes = post_processing(None, 0.001, 0.6, output)
+
+ pred_results = []
+ # Calculate pred box on raw shape
+ for (pred_box, raw_shape) in zip(pred_boxes, batch_shapes):
+ h, w = raw_shape
+                if len(pred_box) == 0:  # no detection results
+                    continue
+ pred_box = np.array(pred_box, dtype=np.float32)
+ pred_box = scale_boxes((config.imgsz, config.imgsz), pred_box, raw_shape, use_letterbox=False)
+
+ pred_results.append(pred_box.tolist())
+
+ save2json(batch_img_id, pred_results, json_result, class_map)
+
+ fps = num_samples / forward_time
+
+ if config.test_mode == "FPS":
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ if config.test_mode == "MAP":
+ if len(json_result) == 0:
+ print("Predict zero box!")
+ exit(1)
+
+ if not os.path.exists(config.pred_dir):
+ os.makedirs(config.pred_dir)
+
+ pred_json = os.path.join(
+ config.pred_dir, f"{config.model_name}_{config.precision}_preds.json"
+ )
+ with open(pred_json, "w") as f:
+ json.dump(json_result, f)
+
+ anno_json = config.coco_gt
+ anno = COCO(anno_json) # init annotations api
+ pred = anno.loadRes(pred_json) # init predictions api
+ eval = COCOeval(anno, pred, "bbox")
+
+ eval.evaluate()
+ eval.accumulate()
+ print(
+ f"==============================eval {config.model_name} {config.precision} coco map =============================="
+ )
+ eval.summarize()
+
+ map, map50 = eval.stats[:2]
+ print("MAP@0.5 : ", map50)
+ print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
+ if map50 >= config.map_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model_name", type=str, default="YOLOV4", help="YOLOV3 YOLOV4 YOLOV5 YOLOV7 YOLOX"
+ )
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
+ parser.add_argument(
+ "--model_engine",
+ type=str,
+ default="",
+ help="model engine path",
+ )
+ parser.add_argument(
+ "--coco_gt",
+ type=str,
+ default="data/datasets/cv/coco2017/annotations/instances_val2017.json",
+ help="coco instances_val2017.json",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--loop_count", type=int, default=-1, help="loop count")
+ parser.add_argument(
+ "--eval_dir",
+ type=str,
+ default="data/datasets/cv/coco2017/val2017",
+ help="coco image dir",
+ )
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=608,
+ help="inference size h,w",
+ )
+ parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs")
+ parser.add_argument("--map_target", type=float, default=0.56, help="target mAP")
+ parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
+
+ config = parser.parse_args()
+ print("config:", config)
+ return config
+
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
diff --git a/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py b/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb0abc21bd5806c51d6b908e3e3407cfdb62cc8
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import ctypes
+import tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+ if not exists(dynamic_path):
+ raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
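+    # Load the plugin shared library and register its custom ops with the TensorRT
+    # plugin registry so engines that rely on them can be built and deserialized.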
+ ctypes.CDLL(dynamic_path)
+ tensorrt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
diff --git a/models/cv/detection/yolov4/ixrt/quant.py b/models/cv/detection/yolov4/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
+ num = step * batch_size
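+    # Build a calibration set of step * batch_size images randomly sampled from the
+    # validation directory for post-training quantization.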
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b732d4eb297b6319ad5bef4660a6f7dde0ef0abc
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode MAP \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision float16 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..796dad720e13250b6ee81c66defca990c416e220
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode FPS \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision float16 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c62d174c09e6f4b005a9b1e7ce028cc47643a930
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV4_DARKNET" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 608
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode MAP \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision int8 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e335fa1d013961c136cda4f79fd2be712311494
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV4_DARKNET" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 608
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode FPS \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision int8 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/README.md b/models/cv/detection/yolov6/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..66258563113c66ba4aa22c98cd8b00ef056900a5
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/README.md
@@ -0,0 +1,84 @@
+# YOLOv6
+
+## Description
+
+YOLOv6 integrates cutting-edge object detection advancements from industry and academia, incorporating recent innovations in network design, training strategies, testing techniques, quantization, and optimization methods. This culmination results in a suite of deployment-ready networks, accommodating varied use cases across different scales.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+```bash
+# get yolov6s.pt
+wget https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s.pt
+# set coco path
+mkdir -p data/
+ln -s /Path/to/coco/ data/coco
+```
+
+### Model Conversion
+
+```bash
+# install yolov6
+git clone https://github.com/meituan/YOLOv6.git
+
+pushd YOLOv6
+pip3 install -r requirements.txt
+
+# export onnx model
+python3 deploy/ONNX/export_onnx.py --weights ../yolov6s.pt --img 640 --batch-size 32 --simplify
+mv ../yolov6s.onnx ../data/
+
+popd
+```
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov6s_fp16_accuracy.sh
+# Performance
+bash scripts/infer_yolov6s_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov6s_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov6s_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | -------- | ------- |
+| YOLOv6 | 32 | FP16 | 1107.511 | - |
+| YOLOv6 | 32 | INT8 | 2080.475 | - |
+
+## Reference
+
+YOLOv6:
diff --git a/models/cv/detection/yolov6/ixrt/build_engine.py b/models/cv/detection/yolov6/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e1719a22c84b400a2ba9b9cbfdea6bae99e80d
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/build_engine.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
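+    # Register min / opt / max shapes for the dynamic "input" binding
+    # (batch 1 to 64, optimum at 32, fixed 3x608x608 images).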
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov6/ixrt/common.py b/models/cv/detection/yolov6/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
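+    # For every engine binding, allocate a device buffer sized from the current
+    # context shape; callers copy inputs into / outputs out of these buffers with pycuda.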
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+    If a single image path is given, return it directly.
+    For a txt file, read it and return each line as an image path.
+    Otherwise treat it as a folder and return the paths of all
+    jpg, jpeg and png files inside it.
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
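+    # Accumulate preprocessed images until a full batch of bs images is collected, then
+    # flush names, stacked images and original shapes together (images remaining after
+    # the last flush are not emitted).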
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
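+    # Greedy NMS: repeatedly keep the highest-confidence box and drop all boxes whose
+    # IoU (or min-area overlap when min_mode=True) with it exceeds nms_thresh.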
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+    # [batch, num, 4] : x1, y1, x2, y2
+ box_array = output[:, :, :4]
+ # [batch, num, 2]
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov6/ixrt/deploy.py b/models/cv/detection/yolov6/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73d14b2617eee1e458825dc66d38177f482a1b1
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/deploy.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseSiLUPass(BasePass):
+ def process(self, graph: Graph) -> Graph:
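+        # SiLU(x) = x * sigmoid(x); match the Sigmoid -> Mul pair so it can be
+        # replaced by a single fused SiLU operator.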
+ pattern = build_sequence_graph([OP.SIGMOID, OP.MUL])
+
+ matcher = GraphMatcher(pattern, strict=False)
+ self.transform = GraphTransform(graph)
+        matcher.findall(graph, self.fuse_silu)
+ return graph
+
+    def fuse_silu(self, graph: Graph, pattern_graph: PatternGraph):
+ sigmoid = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+ if not self.can_fused(graph, pattern_graph):
+ return
+
+ self.transform.delete_operators_between_op_op(sigmoid, mul)
+
+ silu_op = Operator(
+ name=generate_operator_name(graph, pattern="SiLU_{idx}"),
+ op_type=OP.SILU,
+ inputs=copy.copy(sigmoid.inputs),
+ outputs=copy.copy(mul.outputs),
+ )
+ silu_op.is_quant_operator = sigmoid.is_quant_operator and mul.is_quant_operator
+ graph.add_operator(silu_op)
+
+ def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
+ sigmoid = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+        # If the Sigmoid output is consumed by more than one op, the pair cannot be fused.
+ if len(self.transform.get_next_operators(sigmoid)) > 1:
+ return False
+
+        # Check that the other input of Mul comes from the same source as the Sigmoid input.
+        sigmoid_prev_op = graph.get_previous_operators(sigmoid)
+        if len(sigmoid_prev_op) != 1:
+            return False
+
+        mul_prev_op = graph.get_previous_operators(mul)
+        if len(mul_prev_op) != 2:
+            return False
+
+        for op in mul_prev_op:
+            if op is sigmoid_prev_op[0]:
+ return True
+
+ return False
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--src", type=str)
+ parser.add_argument("--dst", type=str)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+
+ args = parse_args()
+ graph = create_source(args.src)()
+ graph = FuseSiLUPass().process(graph)
+ create_target(saved_path=args.dst).export(graph)
+    print("Modified ONNX saved to", args.dst)
diff --git a/models/cv/detection/yolov6/ixrt/inference.py b/models/cv/detection/yolov6/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..836f13b2376ded6144ea9bf0da7ed47cd3f5905f
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/inference.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+sys.path.insert(0, "YOLOv6")
+import json
+import argparse
+import time
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import numpy as np
+from tqdm import tqdm
+
+from common import create_engine_context, setup_io_bindings
+
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from yolov6.core.evaler import Evaler
+from yolov6.utils.events import NCOLS
+from yolov6.utils.nms import non_max_suppression
+from yolov6.data.data_load import create_dataloader
+
+
+coco_classes = {
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange',
+ 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed',
+ 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven',
+ 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
+}
+
+class EvalerIXRT(Evaler):
+ def eval_ixrt(self, args, stride=32):
+ self.stride = stride
+ def init_data(dataloader, task):
+ self.is_coco = self.data.get("is_coco", False)
+ self.ids = self.coco80_to_coco91_class() if self.is_coco else list(range(1000))
+ pad = 0.0
+ dataloader = create_dataloader(
+ self.data[task], self.img_size, self.batch_size, self.stride,
+ check_labels=True, pad=pad, rect=False, data_dict=self.data, task=task)[0]
+ return dataloader
+
+ dataloader = init_data(None,'val')
+ pred_results = []
+
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(args.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((args.bsz,3,args.imgsz,args.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ if args.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(args.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ pbar = tqdm(dataloader, desc="Inferencing model in validation dataset.", ncols=NCOLS)
+
+ forward_time = 0.0
+ num_samples = 0
+ for imgs, targes, paths, shapes in pbar:
+ imgs = imgs.float()
+ pad_batch = len(imgs) != self.batch_size
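+            # The engine expects a fixed batch size: pad the last, smaller batch up to it
+            # and crop the padded predictions off after inference.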
+ if pad_batch:
+ origin_size = len(imgs)
+ imgs = np.resize(imgs, (self.batch_size, *imgs.shape[1:]))
+ imgs /= 255.0
+ # print(imgs.shape)
+ batch_data = np.ascontiguousarray(imgs)
+ data_shape = batch_data.shape
+
+ cur_bsz_sample = batch_data.shape[0]
+ num_samples += cur_bsz_sample
+
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ if not args.perf_only:
+ if pad_batch:
+ output = output[:origin_size]
+
+ outputs = torch.from_numpy(output)
+ outputs = non_max_suppression(outputs, self.conf_thres, self.iou_thres, multi_label=True)
+ pred_results.extend(self.convert_to_coco_format(outputs, imgs, paths, shapes, self.ids))
+ if args.perf_only:
+ fps = num_samples / forward_time
+ return fps
+ else:
+ return dataloader, pred_results
+
+ def eval_ixrt_map(self, pred_results, dataloader, task):
+ '''Evaluate the model.
+ For the 'speed' task, only the model's inference speed is evaluated and the inference time is reported.
+ For the 'val' task, both speed and mAP (computed with pycocotools) are evaluated, and the
+ inference time and mAP values are returned.
+ '''
+ if not self.do_coco_metric and self.do_pr_metric:
+ return self.pr_metric_result
+ print(f'\nEvaluating mAP by pycocotools.')
+ if task != 'speed' and len(pred_results):
+ if 'anno_path' in self.data:
+ anno_json = self.data['anno_path']
+ else:
+ # generated coco format labels in dataset initialization
+ task = 'val' if task == 'train' else task
+ dataset_root = os.path.dirname(os.path.dirname(self.data[task]))
+ base_name = os.path.basename(self.data[task])
+ anno_json = os.path.join(dataset_root, 'annotations', f'instances_{base_name}.json')
+ pred_json = os.path.join(self.save_dir, "predictions.json")
+ print(f'Saving {pred_json}...')
+ with open(pred_json, 'w') as f:
+ json.dump(pred_results, f)
+
+ anno = COCO(anno_json)
+ pred = anno.loadRes(pred_json)
+ cocoEval = COCOeval(anno, pred, 'bbox')
+ if self.is_coco:
+ imgIds = [int(os.path.basename(x).split(".")[0])
+ for x in dataloader.dataset.img_paths]
+ cocoEval.params.imgIds = imgIds
+ cocoEval.evaluate()
+ cocoEval.accumulate()
+ cocoEval.summarize()
+
+ return cocoEval.stats
+ else:
+ print("pred_results is empty")
+ return None
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--model_engine",
+ type=str,
+ default="",
+ help="model engine path",
+ )
+
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=608,
+ help="inference size h,w",
+ )
+
+ parser.add_argument("--datasets",
+ type=str,
+ required=True,
+ help="datasets path.")
+
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+
+ parser.add_argument("--acc_target",
+ type=float,
+ default=None,
+ help="Model inference Accuracy target.")
+
+ parser.add_argument("--fps_target",
+ type=float,
+ default=None,
+ help="Model inference FPS target.")
+
+ parser.add_argument("--perf_only",
+ type=bool,
+ default=False,
+ help="Run performance test only")
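+ # Note: with type=bool, argparse treats any non-empty string (including "false") as True; the scripts pass "--perf_only true".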
+
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ task = 'val'
+
+ batch_size = args.bsz
+ data_path = os.path.join(args.datasets, "images", "val2017")
+ label_path = os.path.join(args.datasets, "annotations", "instances_val2017.json")
+
+
+ data = {
+ 'task': 'val',
+ 'val': data_path,
+ 'anno_path': label_path,
+ 'names': coco_classes,
+ 'is_coco': True,
+ 'nc': 80
+ }
+
+ evaluator = EvalerIXRT(data, batch_size)
+
+ if args.perf_only:
+ fps = evaluator.eval_ixrt(args)
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {args.fps_target}")
+ else:
+ dataloader, pred_results = evaluator.eval_ixrt(args)
+ eval_result = evaluator.eval_ixrt_map(pred_results, dataloader, task)
+ map, map50 = eval_result[:2]
+ print("MAP@0.5 : ", map50)
+ print(f"Accuracy Check : Test {map50} >= target {args.acc_target}")
+ if map50 >= args.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/models/cv/detection/yolov6/ixrt/quant.py b/models/cv/detection/yolov6/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
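+ # Randomly sample step * batch_size images from data_dir to build the calibration set.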
+ num = step * batch_size
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..09cc0ac03802a697696ff3e68ea2c2157e240ea7
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..409fd354e86d7fa3092fda68bd1da2c1ed35498d
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# fuse silu
+# FINAL_MODEL=${CHECKPOINTS_DIR}/yolov6_bs${BATCH_SIZE}_fused.onnx
+# if [ -f $FINAL_MODEL ];then
+# echo " "Fuse silu Skip, $FINAL_MODEL has been existed
+# else
+# python3 ${RUN_DIR}/deploy.py \
+# --src ${CURRENT_MODEL} \
+# --dst ${FINAL_MODEL}
+# echo " "Generate ${FINAL_MODEL}
+# fi
+# CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..701f80f06ac1ca46d154c1122f02913b247a83af
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov6s_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV6s" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 640 \
+ --disable_quant_names '/detect/Split' '/detect/Div' '/detect/Sub' '/detect/Add' '/detect/Add_1' '/detect/Sub_1' '/detect/Div' '/detect/Concat_6' '/detect/Mul' '/detect/Concat_7' \
+ --use_letterbox
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..58f77417058c5461fe84161bb139bcecad4623c6
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov6s_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV6s" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 640 \
+ --disable_quant_names '/detect/Split' '/detect/Div' '/detect/Sub' '/detect/Add' '/detect/Add_1' '/detect/Sub_1' '/detect/Div' '/detect/Concat_6' '/detect/Mul' '/detect/Concat_7' \
+ --use_letterbox
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/detection/yolov8/ixrt/README.md b/models/cv/detection/yolov8/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..07558edf6f3591a70262c778309d67484d1edf4f
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/README.md
@@ -0,0 +1,72 @@
+# YOLOv8
+
+## Description
+
+YOLOv8 combines speed and accuracy in real-time object detection tasks. With a focus on simplicity and efficiency, this model employs a single neural network to make predictions, enabling fast and accurate identification of objects in images or video streams.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install ultralytics
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+```bash
+# get yolov8n.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt
+# set coco path
+mkdir -p data/
+ln -s /Path/to/coco/ data/coco
+```
+
+### Model Conversion
+
+```bash
+python3 export.py --weight yolov8n.pt --batch 32
+onnxsim yolov8n.onnx ./data/yolov8n.onnx
+```
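+
+The exported ONNX can optionally be sanity-checked before building an engine. The snippet below is a minimal sketch; it only assumes the `onnx` package from the Install step and the `./data/yolov8n.onnx` file produced above:
+
+```python
+import onnx
+
+model = onnx.load("./data/yolov8n.onnx")
+onnx.checker.check_model(model)              # raises if the graph is structurally invalid
+print([t.name for t in model.graph.input])   # input tensor name(s)
+print([t.name for t in model.graph.output])  # output tensor name(s)
+```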
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov8n_fp16_accuracy.sh
+# Performance
+bash scripts/infer_yolov8n_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov8n_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov8n_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | -------- | ------- |
+| YOLOv8 | 32 | FP16 | 1511.366 | 0.525 |
+| YOLOv8 | 32 | INT8 | 1841.017 | 0.517 |
diff --git a/models/cv/detection/yolov8/ixrt/build_engine.py b/models/cv/detection/yolov8/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e1719a22c84b400a2ba9b9cbfdea6bae99e80d
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/build_engine.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
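+ # min / opt / max input shapes for the dynamic-batch optimization profile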
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov8/ixrt/common.py b/models/cv/detection/yolov8/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
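+ # Describe each engine binding and allocate a device buffer sized from the context's current binding shape.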
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+ If an image path is given, return it directly.
+ For a txt file, read it and return each line as an image path.
+ Otherwise treat the argument as a folder and return the paths of all
+ jpg, jpeg and png files it contains.
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
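+ # Note: batches are flushed when i % bs == 0, so images collected after the last flush are not returned.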
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
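+ # Greedy IoU-based NMS: keep the highest-confidence box and drop boxes whose overlap exceeds nms_thresh.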
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+ # [batch, num, 4] box coordinates
+ box_array = output[:, :, :4]
+ # [batch, num, 2] -> (class_id, max_conf)
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov8/ixrt/export.py b/models/cv/detection/yolov8/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..383b327e5794fd7930a78e2acfbf4237c556c4d8
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/export.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+from ultralytics import YOLO
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="pytorch model weight.")
+
+ parser.add_argument("--batch",
+ type=int,
+ required=True,
+ help="batchsize of the model.")
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ model = YOLO(args.weight).cpu()
+
+ model.export(format='onnx', batch=args.batch, imgsz=(640, 640), opset=11)
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/detection/yolov8/ixrt/inference.py b/models/cv/detection/yolov8/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..d83b013610c132a776a2dc02663177e20a7ea2e3
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/inference.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import json
+import argparse
+import time
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import numpy as np
+from tqdm import tqdm
+
+from common import create_engine_context, setup_io_bindings
+
+from pathlib import Path
+
+from ultralytics.cfg import get_cfg
+from ultralytics.data import converter
+from ultralytics.utils import DEFAULT_CFG
+from ultralytics.data.utils import check_det_dataset
+from ultralytics.utils.metrics import ConfusionMatrix
+from ultralytics.models.yolo.detect import DetectionValidator
+
+coco_classes = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange',
+ 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed',
+ 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven',
+ 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--model_engine",
+ type=str,
+ required=True,
+ help="ixrt engine path.")
+
+ parser.add_argument("--bsz",
+ type=int,
+ required=True,
+ help="inference batch size.")
+
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=640,
+ help="inference size h,w",
+ )
+
+ parser.add_argument("--datasets",
+ type=str,
+ required=True,
+ help="datasets path.")
+
+ parser.add_argument("--warm_up",
+ type=int,
+ default=3,
+ help="number of warmup before test.")
+
+ parser.add_argument("--num_workers",
+ type=int,
+ default=16,
+ help="number of workers used in pytorch dataloader.")
+
+ parser.add_argument("--acc_target",
+ type=float,
+ default=0.0,
+ help="Model inference Accuracy target.")
+
+ parser.add_argument("--fps_target",
+ type=float,
+ default=0.0,
+ help="Model inference FPS target.")
+
+ parser.add_argument("--conf",
+ type=float,
+ default=0.001,
+ help="confidence threshold.")
+
+ parser.add_argument("--iou",
+ type=float,
+ default=0.65,
+ help="iou threshold.")
+
+ parser.add_argument("--perf_only",
+ type=bool,
+ default=False,
+ help="Run performance test only")
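+ # Note: with type=bool, argparse treats any non-empty string (including "false") as True; the scripts pass "--perf_only true".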
+
+ args = parser.parse_args()
+
+ return args
+
+class IxRT_Validator(DetectionValidator):
+ def __call__(self, config, data):
+ self.data = data
+ self.stride = 32
+ self.dataloader = self.get_dataloader(self.data.get(self.args.split), self.args.batch)
+ self.init_metrics()
+
+ total_num = 0
+
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(config.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((config.bsz,3,config.imgsz,config.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ if config.warm_up > 0:
+ print("\nWarm-up start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm-up done.")
+
+ forward_time = 0.0
+ num_samples = 0
+
+ for batch in tqdm(self.dataloader):
+ batch = self.preprocess(batch)
+
+ imgs = batch['img']
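+ # Pad the last partial batch up to the configured batch size; padded outputs are sliced off after inference.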
+ pad_batch = len(imgs) != self.args.batch
+ if pad_batch:
+ origin_size = len(imgs)
+ imgs = np.resize(imgs, (self.args.batch, *imgs.shape[1:]))
+
+ batch_data = np.ascontiguousarray(imgs)
+ data_shape = batch_data.shape
+
+ cur_bsz_sample = batch_data.shape[0]
+ num_samples += cur_bsz_sample
+
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+ if pad_batch:
+ output = output[:origin_size]
+
+ outputs = torch.from_numpy(output)
+
+ preds = self.postprocess([outputs])
+
+ self.update_metrics(preds, batch)
+
+ if config.perf_only:
+ fps = num_samples / forward_time
+ return fps
+ else:
+ stats = self.get_stats()
+
+ if self.args.save_json and self.jdict:
+ with open(str(self.save_dir / 'predictions.json'), 'w') as f:
+ print(f'Saving {f.name} ...')
+ json.dump(self.jdict, f) # flatten and save
+
+ stats = self.eval_json(stats)
+
+ return stats
+
+ def init_metrics(self):
+ """Initialize evaluation metrics for YOLO."""
+ val = self.data.get(self.args.split, '') # validation path
+ self.is_coco = isinstance(val, str) and 'coco' in val and val.endswith(f'{os.sep}val2017.txt') # is COCO
+ self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(1000))
+ self.args.save_json |= self.is_coco and not self.training # run on final val if training COCO
+ self.names = self.data['names']
+ self.nc = len(self.names)
+ self.metrics.names = self.names
+ self.confusion_matrix = ConfusionMatrix(nc=80)
+ self.seen = 0
+ self.jdict = []
+ self.stats = dict(tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[])
+
+def main():
+ config = parse_args()
+
+ batch_size = config.bsz
+
+ overrides = {'mode': 'val'}
+ cfg_args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
+
+ cfg_args.batch = batch_size
+ cfg_args.save_json = True
+
+ data = {
+ 'path': Path(config.datasets),
+ 'val': os.path.join(config.datasets, 'val2017.txt'),
+ 'names': coco_classes
+ }
+
+ validator = IxRT_Validator(args=cfg_args, save_dir=Path('.'))
+
+ if config.perf_only:
+ fps = validator(config, data)
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ else:
+ stats = validator(config, data)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/models/cv/detection/yolov8/ixrt/quant.py b/models/cv/detection/yolov8/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
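+ # Randomly sample step * batch_size images from data_dir to build the calibration set.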
+ num = step * batch_size
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44e7537657a65fc84d89531b8df9ad647513dfbe
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ab3808f1f45cf2072fa41a2107fa88c17fa3610
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a2257463d70ee8fe6e9853db0fafd44f98ad8c83
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+DISABLE_NAMES=('/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5')
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov8n_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV8N" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --disable_quant_names '/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5' \
+ --imgsz 640
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1774d5b2b28ce734dadb3e022a3359b3790f2da
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov8n_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV8N" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --disable_quant_names '/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5' \
+ --imgsz 640
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+    --fps_target 0.0; check_status
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/README.md b/models/cv/face/facenet/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c2df5120bf75917c11d1d5a68c7dd377c5c823a
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/README.md
@@ -0,0 +1,101 @@
+# FaceNet
+
+## Description
+
+FaceNet is a facial recognition system originally proposed and developed by Google. It uses deep learning, specifically convolutional neural networks (CNNs), to transform facial images into high-dimensional feature vectors (embeddings). These embeddings are highly discriminative, enabling different faces to be compared and identified. The core idea of FaceNet is to map faces into a multi-dimensional embedding space, achieving an efficient representation and recognition of faces.
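+
+For intuition, the sketch below (illustrative only, not part of this repository) shows how two 512-dimensional embeddings produced by the model could be compared; the 1.1 distance threshold is an assumption for illustration, not a tuned value.
+
+```python
+import numpy as np
+
+def embedding_distance(emb_a: np.ndarray, emb_b: np.ndarray) -> float:
+    """Euclidean distance between two L2-normalized face embeddings."""
+    emb_a = emb_a / np.linalg.norm(emb_a)
+    emb_b = emb_b / np.linalg.norm(emb_b)
+    return float(np.linalg.norm(emb_a - emb_b))
+
+# Illustration with random vectors; real embeddings come from the model's 512-d output.
+a, b = np.random.rand(512), np.random.rand(512)
+print("same identity" if embedding_distance(a, b) < 1.1 else "different identities")
+```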
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tensorflow
+pip3 install onnxsim
+pip3 install scikit-learn
+pip3 install tf_slim
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install scipy==1.8.0
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install simplejson
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the LFW dataset.
+
+```bash
+cd ${DeepSparkInference_PATH}/models/cv/face/facenet/ixrt
+# download and unzip 20180408-102900.zip
+unzip 20180408-102900.zip
+```
+
+### Model Conversion
+
+```bash
+
+mkdir -p checkpoints
+mkdir -p facenet_weights
+git clone https://github.com/timesler/facenet-pytorch
+mv /Path/facenet/ixrt/tensorflow2pytorch.py facenet-pytorch
+python3 ./facenet-pytorch/tensorflow2pytorch.py \
+ --facenet_weights_path ./facenet_weights \
+ --facenet_pb_path ./20180408-102900 \
+ --onnx_save_name facenet_export.onnx
+mv facenet_export.onnx ./facenet_weights
+```
+
+### Data Preprocessing
+
+The images in the original dataset need to be resized to 160x160. For details, please refer to the following link: . That preprocessing code relies on TensorFlow 1.x; if you run into TensorFlow version incompatibilities while processing the dataset, you can instead download the preprocessed dataset from here:
+
+```bash
+# download and unzip facenet_datasets.zip
+wget https://raw.githubusercontent.com/lanrax/Project_dataset/master/facenet_datasets.zip
+unzip facenet_datasets.zip
+```
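+
+If you prefer to preprocess the images yourself, the sketch below (illustrative only; it assumes Pillow is installed and uses a hypothetical `lfw_raw` source directory) covers only the final 160x160 resize step and does not replace the face detection/alignment performed by the original preprocessing pipeline referenced above.
+
+```python
+from pathlib import Path
+from PIL import Image
+
+src = Path("lfw_raw")               # hypothetical directory of original images
+dst = Path("facenet_datasets/lfw")  # layout expected by the inference scripts
+for img_path in src.rglob("*.jpg"):
+    out_path = dst / img_path.relative_to(src)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    Image.open(img_path).convert("RGB").resize((160, 160), Image.BILINEAR).save(out_path)
+```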
+
+## Inference
+
+Because exported models can differ, verify the following before running inference: in deploy.py, "/last_bn/BatchNormalization_output_0" must match the output name of the BatchNormalization node in your exported ONNX model (it may instead be a plain number such as "1187"), and "/avgpool_1a/GlobalAveragePool_output_0" must match the output name of the GlobalAveragePool node (such as "1178"). Also update "/last_bn/BatchNormalization_output_0" in build_engine.py to the corresponding name, such as "1187".
+
+```bash
+sed -i -e 's#/last_bn/BatchNormalization_output_0#1187#g' -e 's#/avgpool_1a/GlobalAveragePool_output_0#1178#g' deploy.py build_engine.py
+```
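+
+If the substitution above does not match your export, one way to look up the actual node output names (a minimal inspection sketch, assuming the `onnx` Python package is installed):
+
+```python
+import onnx
+
+model = onnx.load("facenet_weights/facenet_export.onnx")
+for node in model.graph.node:
+    if node.op_type in ("BatchNormalization", "GlobalAveragePool"):
+        print(node.op_type, node.name, "->", list(node.output))
+```
+
+Substitute the printed output names into the sed command above.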
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_facenet_fp16_accuracy.sh
+# Performance
+bash scripts/infer_facenet_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_facenet_int8_accuracy.sh
+# Performance
+bash scripts/infer_facenet_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | AUC | ACC |
+| ------- | --------- | --------- | --------- | ----- | ---------------- |
+| FaceNet | 64 | FP16 | 8825.802 | 0.999 | 0.98667+-0.00641 |
+| FaceNet | 64 | INT8 | 14274.306 | 0.999 | 0.98633+-0.00605 |
diff --git a/models/cv/face/facenet/ixrt/build_engine.py b/models/cv/face/facenet/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..74a62202defa50397cc4227da2181eebe10ab3e9
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/build_engine.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+
+import onnx
+from onnx import helper
+from onnx import TensorProto,numpy_helper
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def add_facenet_norm(onnx_model):
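+    # Append the custom FacenetNorm_IxRT plugin node (registered via load_ixrt_plugin) after the
+    # 512-d BatchNormalization output, and rebuild the graph so that node's output is the sole
+    # graph output; the patched model is saved to the temporary file tmp4.onnx.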
+ norm = helper.make_node('FacenetNorm_IxRT', inputs=['/last_bn/BatchNormalization_output_0'] , outputs=['/Pow_1_output_0'], name='facenet_norm_1', size=512)
+
+ onnx_model = onnx.load(onnx_model)
+ graph = onnx_model.graph
+ nodes = graph.node
+ graph.node.append(norm)
+ output = onnx.helper.make_tensor_value_info('/Pow_1_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+ graph = onnx.helper.make_graph(
+ graph.node,
+ "facenet model",
+ graph.input,
+ [output],
+ graph.initializer
+ )
+ info_model = onnx.helper.make_model(graph, producer_name="facenet")
+ info_model.opset_import[0].version = 11
+ onnx.save(info_model, "tmp4.onnx")
+
+def main(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ print("start prepare...")
+ add_facenet_norm(config.model)
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file("tmp4.onnx")
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ os.remove("tmp4.onnx")
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--engine", type=str, default=None)
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/common.py b/models/cv/face/facenet/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db1327ad1531c452fb38182d747c81fc6f8eccf
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/common.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import cv2
+import glob
+import torch
+import tensorrt
+import numpy as np
+import pycuda.driver as cuda
+
+from torch.utils.data import DataLoader, SubsetRandomSampler, SequentialSampler
+from torchvision import datasets, transforms
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+
+def fixed_image_standardization(image_tensor):
+ processed_tensor = (image_tensor - 127.5) / 128.0
+ return processed_tensor
+
+def collate_pil(x):
+ out_x, out_y = [], []
+ for xx, yy in x:
+ out_x.append(xx)
+ out_y.append(yy)
+ return out_x, out_y
+
+def getdataloader(datasets_dir, step=20, batch_size=64, image_size=160):
+ orig_img_ds = datasets.ImageFolder(datasets_dir + 'lfw', transform=None)
+ orig_img_ds.samples = [
+ (p, p)
+ for p, _ in orig_img_ds.samples
+ ]
+ loader = DataLoader(
+ orig_img_ds,
+ num_workers=16,
+ batch_size=batch_size,
+ collate_fn=collate_pil
+ )
+ crop_paths = []
+ box_probs = []
+ for i, (x, b_paths) in enumerate(loader):
+ crops = [p for p in b_paths]
+ crop_paths.extend(crops)
+ # print('\rBatch {} of {}'.format(i + 1, len(loader)), end='')
+
+ trans = transforms.Compose([
+ np.float32,
+ transforms.ToTensor(),
+ fixed_image_standardization
+ ])
+
+ dataset = datasets.ImageFolder(datasets_dir + 'lfw', transform=trans)
+ embed_loader = DataLoader(
+ dataset,
+ num_workers=16,
+ batch_size=batch_size,
+ sampler=SequentialSampler(dataset)
+ )
+
+ return embed_loader, crop_paths
diff --git a/models/cv/face/facenet/ixrt/config/FACENET_CONFIG b/models/cv/face/facenet/ixrt/config/FACENET_CONFIG
new file mode 100644
index 0000000000000000000000000000000000000000..3b3282eff772fa4a2d46d2cc2aace1570ad0f1bb
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/config/FACENET_CONFIG
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# IMGSIZE : model input height/width
+# MODEL_NAME : basename used for the generated onnx/engine files
+# ORIGINE_MODEL : file name of the original onnx model
+IMGSIZE=160
+MODEL_NAME=facenet
+ORIGINE_MODEL=facenet_export.onnx
+
+# QUANT CONFIG (effective only when PRECISION is int8)
+    # QUANT_OBSERVER : quantization observer, one of [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE : dataloader batch size used during quantization; keep it consistent with the batch size in the onnx, otherwise some ops (e.g. Reshape) may infer shapes incorrectly
+    # QUANT_STEP : number of quantization (calibration) steps
+    # QUANT_SEED : random seed, to make the quantization results reproducible
+    # QUANT_EXIST_ONNX : set this if a quantized model from another source should be used
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=64
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=
+QUANT_EXIST_ONNX=
diff --git a/models/cv/face/facenet/ixrt/deploy.py b/models/cv/face/facenet/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f4ce5880bb50f78127a923e09c446547ac3fd2
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/deploy.py
@@ -0,0 +1,445 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import os
+import simplejson as json
+import argparse
+from onnxsim import simplify
+import numpy as np
+import shutil
+from onnx import numpy_helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def onnx_sim(onnx_name, save_name):
+ # simplify onnx
+ cmd = "onnxsim {} {}".format(onnx_name, save_name)
+ os.system(cmd)
+ print("[info] onnxsim done!")
+
+
+def cut_model(onnx_name):
+ input_names = ["input"]
+ output_names = ["/last_bn/BatchNormalization_output_0"]
+ onnx.utils.extract_model(onnx_name, onnx_name, input_names, output_names)
+
+def fuse_matmul(onnx_name, save_onnx_name):
+ find_matmul = 0
+
+ onnx_model = onnx.load(onnx_name)
+
+ graph = onnx_model.graph
+ nodes = graph.node
+
+ conv_weights = None
+ conv_bias = None
+ bn_weights = None
+ bn_bias = None
+ conv_weights_new = None
+ conv_bias_new = None
+
+ pre_node = None
+ for i, node in enumerate(nodes):
+ if (node.op_type == "Conv"):
+ pass
+ if (node.op_type == "MatMul"):
+ for k, ten in enumerate(graph.initializer):
+ if ten.name == node.input[1]:
+ H , W = ten.dims
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ weights = weights.reshape(ten.dims)
+ conv_weights = weights.transpose()
+ if (node.op_type == "BatchNormalization" and pre_node.op_type == "MatMul"):
+ find_matmul=1
+ weights = None
+ bias = None
+ mean = None
+ var = None
+
+ for k, ten in enumerate(graph.initializer):
+ if ten.name == node.input[1]:
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[2]:
+ bias = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[3]:
+ mean = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[4]:
+ var = np.fromstring(ten.raw_data, dtype=np.float32)
+
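+            # Fold the BatchNormalization into the preceding MatMul, which is rewritten below as
+            # a 1x1 Conv: y = gamma * (x W - mean) / sqrt(var + eps) + beta, so the fused weight
+            # is diag(gamma / sqrt(var + eps)) @ W^T and the fused bias is
+            # beta - gamma * mean / sqrt(var + eps). A fixed eps of 1e-8 is used here rather than
+            # the node's epsilon attribute.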
+ bn_weights = np.diag(weights / np.sqrt(var + 1e-8))
+ bn_bias = bias - weights * mean / np.sqrt(var + 1e-8)
+
+ conv_weights_new = np.matmul(bn_weights, conv_weights)
+ a, b = conv_weights_new.shape
+ conv_weights_new = conv_weights_new.reshape((a,b,1,1))
+ # conv_bias_new = bn_weights * conv_bias + bn_bias
+ conv_bias_new = 0 + bn_bias
+ conv_weights_new_initializer = onnx.numpy_helper.from_array(conv_weights_new, name='conv_weights_new')
+ graph.initializer.append(conv_weights_new_initializer)
+ conv_bias_new_initializer = onnx.numpy_helper.from_array(conv_bias_new, name='conv_bias_new')
+ graph.initializer.append(conv_bias_new_initializer)
+
+ pre_node.op_type = "Conv"
+ pre_node.input[0] = "/avgpool_1a/GlobalAveragePool_output_0"
+ pre_node.input[1] = "conv_weights_new"
+ pre_node.input.append("conv_bias_new")
+ pre_node.output[0] = "/last_bn/BatchNormalization_output_0"
+ dilations = onnx.helper.make_attribute("dilations", [1,1])
+ group = onnx.helper.make_attribute("group", 1)
+ kernel_shape = onnx.helper.make_attribute("kernel_shape", [1,1])
+ pads = onnx.helper.make_attribute("pads", [0,0,0,0])
+ strides = onnx.helper.make_attribute("strides", [1,1])
+
+ pre_node.attribute.append(dilations)
+ pre_node.attribute.append(group)
+ pre_node.attribute.append(kernel_shape)
+ pre_node.attribute.append(pads)
+ pre_node.attribute.append(strides)
+ graph.node.remove(node)
+
+ pre_node = node
+
+ for i, node in enumerate(nodes):
+ if (node.name == "Reshape_353"):
+ # print("[reshape] : ", node.name)
+ graph.node.remove(node)
+
+ if find_matmul==1:
+ output = onnx.helper.make_tensor_value_info('/last_bn/BatchNormalization_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+ graph = onnx.helper.make_graph(
+ graph.node,
+ "facenet model",
+ graph.input,
+ [output],
+ graph.initializer
+ )
+
+ info_model = onnx.helper.make_model(graph, producer_name="facenet")
+ info_model.opset_import[0].version = 11
+ onnx_model = onnx.shape_inference.infer_shapes(info_model)
+
+ onnx.checker.check_model(onnx_model)
+ onnx.save(onnx_model, save_onnx_name)
+
+def fuse_mul(onnx_name, save_onnx_name):
+ onnx_model = onnx.load(onnx_name)
+
+ graph = onnx_model.graph
+ nodes = graph.node
+ pre_node = None
+
+ for i, node in enumerate(nodes):
+ if (node.op_type == "Constant"):
+ pass
+
+ if (node.op_type == "Mul" and pre_node.op_type == "Conv" ):
+ for ten in graph.initializer:
+ if ten.name == node.input[1]:
+ scale_name = ten.name
+ scale = np.fromstring(ten.raw_data, dtype=np.float32)
+
+ for k, ten in enumerate(graph.initializer):
+ # print(ten.name)
+ if ten.name == pre_node.input[1]:
+ weights_name = ten.name
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ weights *= scale
+ graph.initializer[k].raw_data = weights.tobytes()
+
+ if ten.name == pre_node.input[2]:
+ bias_name = ten.name
+ bias = np.fromstring(ten.raw_data, dtype=np.float32)
+ # print("bias len: ",len(da))
+ bias *= scale
+ graph.initializer[k].raw_data = bias.tobytes()
+
+ new_conv = pre_node
+ new_conv.output[0] = node.output[0]
+ graph.node.remove(node)
+ pre_node = node
+
+ onnx.checker.check_model(onnx_model)
+ onnx.save(onnx_model, save_onnx_name)
+
+def create_graph_json(onnx_name):
+ # create graph json and weights
+ graph_path = onnx_name[0:-5] + "_graph.json"
+ weight_path = onnx_name[0:-5] + ".weights"
+
+ model = onnx.load(onnx_name)
+ graph = model.graph
+ nodes = graph.node
+ initializer = graph.initializer
+ value_info = graph.value_info # Infer shape info
+
+ model_inputs = [tensor.name for tensor in graph.input]
+ model_outputs = [tensor.name for tensor in graph.output]
+
+ model = {}
+ model["nodes"] = {}
+ model["tensors"] = {}
+ model["edges"] = {}
+ model["output"] = {}
+ data_type_table = {
+ 1: "float32",
+ 2: "uint8",
+ 3: "int8",
+ 4: "uint16",
+ 5: "int16",
+ 6: "int32",
+ 7: "int64",
+ 9: "bool",
+ 10: "float16",
+ 11: "double",
+ 12: "uint32",
+ 13: "uint64",
+ }
+ input_cache = []
+ for item in graph.input:
+ if item.type.tensor_type.elem_type in data_type_table:
+ cache = {
+ "name": item.name,
+ "type": data_type_table[item.type.tensor_type.elem_type],
+ }
+ else:
+ cache = {"name": item.name}
+ input_cache.append(cache)
+ model["input"] = input_cache
+
+ output_cache = []
+ for item in graph.output:
+ if item.type.tensor_type.elem_type in data_type_table:
+ cache = {
+ "name": item.name,
+ "type": data_type_table[item.type.tensor_type.elem_type],
+ }
+ else:
+ cache = {"name": item.name}
+ output_cache.append(cache)
+ model["output"] = output_cache
+
+ # find cast dict
+ input_cast_dict = {}
+ output_cast_dict = {}
+ for i, item in enumerate(nodes):
+ node_name = item.name
+ input_edge_list = list(item.input)
+ output_edge_list = list(item.output)
+ # remove input and output cast op
+ if item.op_type == "Cast":
+ if input_edge_list[0] in model_inputs:
+ input_cast_dict[output_edge_list[0]] = input_edge_list[0]
+ if output_edge_list[0] in model_outputs:
+ output_cast_dict[input_edge_list[0]] = output_edge_list[0]
+
+ for i, item in enumerate(nodes):
+ node_name = item.name
+ input_edge_list = list(item.input)
+ output_edge_list = list(item.output)
+ # remove input and output cast op
+ if item.op_type == "Cast":
+ if input_edge_list[0] in model_inputs:
+ continue
+ if output_edge_list[0] in model_outputs:
+ continue
+
+ for idx, edge_name in enumerate(input_edge_list):
+ if edge_name in input_cast_dict.keys():
+ input_edge_list[idx] = input_cast_dict[edge_name]
+
+ for idx, edge_name in enumerate(output_edge_list):
+ if edge_name in output_cast_dict.keys():
+ output_edge_list[idx] = output_cast_dict[edge_name]
+
+ # remove mask in EmbedLayerNormalization
+ if item.op_type == "EmbedLayerNormalization":
+ no_attention_mask_in_Embed = True
+ for input_edge in input_edge_list:
+ if "attention_mask" in input_edge:
+ input_edge_list.remove(input_edge)
+ no_attention_mask_in_Embed = False
+ if no_attention_mask_in_Embed:
+ for tensor_name in model_inputs:
+ if "attention_mask" in tensor_name:
+ output_edge_list[1] = tensor_name
+
+ node_dict = {"inputs": input_edge_list, "outputs": output_edge_list}
+ node_dict["op_type"] = item.op_type
+ attribute_dict = {}
+
+ if node_name == "":
+ for input_edge in input_edge_list:
+ node_name += input_edge + "_"
+ node_name += "to"
+ for output_edge in output_edge_list:
+ node_name += "_" + output_edge
+
+ for attr in item.attribute:
+
+ if attr.type == onnx.AttributeProto().AttributeType.FLOAT:
+ attribute_dict[attr.name] = attr.f
+ if attr.type == onnx.AttributeProto().AttributeType.FLOATS:
+ attribute_dict[attr.name] = [x for x in attr.floats]
+ if attr.type == onnx.AttributeProto().AttributeType.INT:
+ attribute_dict[attr.name] = attr.i
+ if attr.type == onnx.AttributeProto().AttributeType.INTS:
+ attribute_dict[attr.name] = [x for x in attr.ints]
+ if attr.type == onnx.AttributeProto().AttributeType.STRING:
+ attribute_dict[attr.name] = str(attr.s.decode("UTF-8"))
+ if attr.type == onnx.AttributeProto().AttributeType.STRINGS:
+ attribute_dict[attr.name] = [str(x.decode("UTF-8")) for x in attr.strings]
+
+ node_dict["attrbiute"] = attribute_dict
+ model["nodes"][node_name] = node_dict
+
+ for i, item in enumerate(initializer):
+ tensor_name = item.name
+ tensor_dict = {}
+ if item.data_type in data_type_table:
+ tensor_dict["data_type"] = data_type_table[item.data_type]
+ else:
+ print(
+ tensor_name,
+ " use unsupport data type: ",
+ item.data_type,
+ ", data info will not be saved",
+ )
+ continue
+ tensor_dict["dims"] = list(item.dims)
+
+ model["tensors"][tensor_name] = tensor_dict
+
+ with open(graph_path, "w") as fh:
+ json.dump(model, fh, indent=4)
+
+
+ """
+ Export weight
+ """
+ byte_string = "".encode()
+
+ weight_file_postfix = ".weights"
+ for item in initializer:
+ tensor_name = item.name
+
+ np_data = None
+ if len(item.raw_data):
+ np_data = np.frombuffer(item.raw_data, dtype=np.byte)
+ elif item.data_type == 1 and len(item.float_data):
+ np_data = np.array(list(item.float_data), dtype=np.float32)
+ elif item.data_type == 2 and len(item.int32_data):
+ np_data = np.array(list(item.int32_data), dtype=np.uint8)
+ elif item.data_type == 6 and len(item.int32_data):
+ np_data = np.array(list(item.int32_data), dtype=np.int32)
+ elif item.data_type == 7 and len(item.int64_data):
+ np_data = np.array(list(item.int64_data), dtype=np.int64)
+ elif item.data_type == 10 and len(item.int32_data):
+ np_data = (
+ np.asarray(item.int32_data, dtype=np.uint16)
+ .reshape(item.dims)
+ .view(np.float16)
+ )
+ else:
+ print(
+ "tensor name: ",
+ tensor_name,
+ ", type: ",
+ item.data_type,
+ ", len: ",
+ len(item.raw_data),
+ len(item.float_data),
+ len(item.int32_data),
+ len(item.int64_data),
+ ", will not save into weights file",
+ )
+
+ if np_data is not None:
+ byte_string += np.uint64(len(tensor_name)).tobytes()
+ byte_string += tensor_name.encode()
+ np_bytes = np_data.tobytes()
+ byte_string += np.uint64(len(np_bytes)).tobytes()
+ byte_string += np_bytes
+
+
+ # Export weight values as bin file
+ with open(weight_path, "wb") as fh:
+ fh.write(byte_string)
+ print("----------------------------")
+ print("[OK] graph and weights file save at :")
+ print(graph_path)
+ print(weight_path)
+ return graph_path, weight_path
+
+def add_facenet_norm(cfg_name):
+ graph_json = json.load(open(cfg_name))
+
+ graph_json["nodes"]["facenet_norm_1"] = {
+ "inputs": [
+ "/last_bn/BatchNormalization_output_0"
+ ],
+ "outputs": [
+ "/Pow_1_output_0"
+ ],
+ "op_type": "FacenetNorm",
+ "attrbiute": {
+ "size": 512
+ }
+ }
+ graph_json["output"] = []
+ graph_json["output"].append({"name":"/Pow_1_output_0", "type":"float32"})
+
+ with open(cfg_name, "w") as fh:
+ json.dump(graph_json, fh, indent=4)
+
+
+def main(args):
+ print("[info] input onnx name :", args.onnx_name)
+ # onnxsim
+ onnx_sim(args.onnx_name, "tmp1.onnx")
+ # cut model
+ cut_model("tmp1.onnx")
+ # fuse matmul bn
+ fuse_matmul("tmp1.onnx", "tmp2.onnx")
+ # fuse mul
+ fuse_mul("tmp2.onnx", "facenet_weights/facenet.onnx")
+ # generate cfg weights
+ # graph_path, weight_path = create_graph_json("facenet_weights/facenet.onnx")
+ # add facenet norm
+ # add_facenet_norm(graph_path)
+
+ os.remove("tmp1.onnx")
+ os.remove("tmp2.onnx")
+ print("\n[info] facenet deploy done!!!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("deploy facenet")
+ parser.add_argument("--model_name", default="facenet", help="model name")
+ parser.add_argument("--onnx_name", default="facenet_weights/facenet_export.onnx", help="onnx filepath")
+ parser.add_argument("--save_name", default="facenet_weights/facenet.onnx", help="onnx filepath")
+ parser.add_argument("--data_type", default="int8", type=str, choices=["float16", "int8"], help="int8 float16")
+ parser.add_argument("--batch_size", default="64", type=int, help="batch_size")
+ parser.add_argument("--quant_file", default="", type=str, help="quant file")
+ parser.add_argument("--img_size", default="160", type=int, help="image size")
+ parser.add_argument("--device", default=0, type=int, help="cuda device 0 1 3 ...")
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/inference.py b/models/cv/face/facenet/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec9876e33c800206003d4d5e2c2d165929ba6591
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/inference.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import json
+import os
+import re
+import time
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import tensorrt
+from tensorrt.utils import topk
+from sklearn import metrics
+from scipy.optimize import brentq
+from sklearn.model_selection import KFold
+from scipy import interpolate
+
+from utils import read_pairs, get_paths, evaluate
+from common import getdataloader, create_engine_context, get_io_bindings
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def main(config):
+ embed_loader, crop_paths = getdataloader(config.datasets_dir, config.loop_count, config.bsz, config.imgsz)
+
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+ # Load Engine && I/O bindings
+ engine, context = create_engine_context(config.engine_file, logger)
+ inputs, outputs, allocations = get_io_bindings(engine)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ # Inference
+ if config.test_mode == "FPS":
+ torch.cuda.synchronize()
+ start_time = time.time()
+
+ for i in range(config.loop_count):
+ context.execute_v2(allocations)
+
+ torch.cuda.synchronize()
+ end_time = time.time()
+ forward_time = end_time - start_time
+
+ fps = config.loop_count * config.bsz / forward_time
+
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ elif config.test_mode == "ACC":
+
+ classes = []
+ embeddings = []
+
+ for xb, yb in tqdm(embed_loader):
+
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ current_imgs_num = xb.numpy().shape[0]
+ xb = xb.numpy()
+ xb = np.ascontiguousarray(xb)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], xb)
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ output = output.reshape(output.shape[0],output.shape[1])
+ #print("output shape ",output.shape)
+
+ classes.extend(yb[0:current_imgs_num].numpy())
+ embeddings.extend(output)
+
+
+ embeddings_dict = dict(zip(crop_paths,embeddings))
+
+ pairs = read_pairs(config.datasets_dir + config.pairs_name)
+ path_list, issame_list = get_paths(config.datasets_dir + 'lfw', pairs)
+ # embeddings = np.array([embeddings_dict[path.replace(".png",".jpg")] for path in path_list])
+ embeddings = np.array([embeddings_dict[path] for path in path_list])
+ tpr, fpr, accuracy, val, val_std, far, fp, fn = evaluate(embeddings, issame_list)
+
+ print('\nAccuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
+ print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
+
+ auc = metrics.auc(fpr, tpr)
+ print('Area Under Curve (AUC): %1.3f' % auc)
+ #eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr, fill_value="extrapolate")(x), 0., 1.)
+ #print('Equal Error Rate (EER): %1.3f' % eer)
+
+ acc = np.mean(accuracy)
+ print(f"Accuracy Check : Test {acc} >= target {config.acc_target}")
+ if acc >= config.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+    parser.add_argument("--test_mode", type=str, default="FPS", help="FPS or ACC")
+ parser.add_argument(
+ "--engine_file",
+ type=str,
+ help="engine file path"
+ )
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+        help="facenet datasets dir (contains the lfw folder and pairs.txt)",
+ )
+    parser.add_argument("--pairs_name", type=str, default="pairs.txt", help="LFW pairs file name")
+ parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times")
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=160,
+ help="inference size h,w",
+ )
+ parser.add_argument("--use_async", action="store_true")
+ parser.add_argument(
+ "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4"
+ )
+ parser.add_argument("--fps_target", type=float, default=-1.0)
+ parser.add_argument("--acc_target", type=float, default=-1.0)
+ parser.add_argument("--loop_count", type=int, default=-1)
+
+ config = parser.parse_args()
+ return config
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/load_ixrt_plugin.py b/models/cv/face/facenet/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae47dc8e854b6bea1f768e65c4dd481048bfebce
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+import tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+ if not exists(dynamic_path):
+ raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
+ ctypes.CDLL(dynamic_path)
+ tensorrt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/quant.py b/models/cv/face/facenet/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..26413e3e0f58f219cce2bd78804de288cba1fd1a
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/quant.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import torch
+from tensorrt.deploy.api import *
+from tensorrt.deploy.utils.seed import manual_seed
+from torchvision import models
+from argparse import ArgumentParser
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+from torchvision import datasets, transforms
+import cv2
+import numpy as np
+import math
+import simplejson as json
+from tensorrt.deploy import static_quantize
+
+
+# manual_seed(43)
+device = 0 if torch.cuda.is_available() else "cpu"
+
+
+def fixed_image_standardization(image_tensor):
+ processed_tensor = (image_tensor - 127.5) / 128.0
+ return processed_tensor
+
+def create_dataloader(args):
+ image_dir_path = os.path.join(args.data_path, "lfw")
+
+ trans = transforms.Compose([
+ np.float32,
+ transforms.ToTensor(),
+ fixed_image_standardization
+ ])
+
+ dataset = datasets.ImageFolder(args.data_path + 'lfw', transform=trans)
+
+ calibration_dataset = dataset
+ print("image folder total images : ", len(dataset))
+ if args.num_samples is not None:
+ indices = np.random.permutation(len(dataset))[:args.num_samples]
+ calibration_dataset = torch.utils.data.Subset(
+ dataset, indices=indices
+ )
+ print("calibration_dataset images : ", len(calibration_dataset))
+
+ assert len(dataset), f"data size is 0, check data path please"
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ batch_size=args.batch_size,
+ shuffle=True,
+ num_workers=args.workers,
+ )
+ verify_dataloader = DataLoader(
+ dataset,
+ batch_size=args.batch_size,
+ shuffle=True,
+ num_workers=args.workers,
+ )
+
+ return calibration_dataloader, verify_dataloader
+
+
+@torch.no_grad()
+def quantize_model(args, model_name, model, dataloader):
+
+ calibration_dataloader, verify_dataloader = dataloader
+ print("calibration dataset length: ", len(calibration_dataloader))
+
+ if isinstance(model, torch.nn.Module):
+ model = model.to(device)
+ model.eval()
+
+ static_quantize(args.model,
+ calibration_dataloader=calibration_dataloader,
+ save_quant_onnx_path=os.path.join("./facenet_weights", f"{model_name}-quant.onnx"),
+ observer=args.observer,
+ data_preprocess=lambda x: x[0].to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=None)
+
+def create_argparser(*args, **kwargs):
+ parser = ArgumentParser(*args, **kwargs)
+ parser.add_argument("--batch_size", type=int, default=64)
+ parser.add_argument("--img_size", type=int, default=160)
+ parser.add_argument("-j", "--workers", type=int, default=4)
+ parser.add_argument("--model", type=str, default="./facenet_weights/facenet.onnx")
+ parser.add_argument("--num_samples", type=int, default=1000)
+ parser.add_argument("--data_path", type=str, default="./facenet_datasets/")
+ parser.add_argument("--analyze", action="store_true")
+ parser.add_argument("--observer", type=str, default="hist_percentile")
+ parser.add_argument("--fp32_acc", action="store_true")
+ parser.add_argument("--use_ixrt", action="store_true")
+ parser.add_argument("--quant_params", type=str, default=None)
+ parser.add_argument("--disable_bias_correction", action="store_true")
+ return parser
+
+def parse_args():
+ parser = create_argparser("PTQ Quantization")
+ args = parser.parse_args()
+ args.use_ixquant = not args.use_ixrt
+ return args
+
+
+def main():
+ args = parse_args()
+ print(args)
+ dataloader = create_dataloader(args)
+
+ if args.model.endswith(".onnx"):
+ model_name = os.path.basename(args.model)
+ model_name = model_name.rsplit(".", maxsplit=1)[0]
+ model = args.model
+    else:
+        print("[Error] model file must be an .onnx file, got: ", args.model)
+        return
+    quantize_model(args, model_name, model, dataloader)
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..27e5e8ad859d95c86dfc9b29fdc78150b0c60c95
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..401658cafd85297b9d98f7febb9e7c88746062ef
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c2c2f176bcd0ea6bb00acedb6fbda80b47456a08
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --data_path ${DATASETS_DIR} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7574347c028dfdb28e3b06016d4c61fb6d3e1328
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --data_path ${DATASETS_DIR} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/tensorflow2pytorch.py b/models/cv/face/facenet/ixrt/tensorflow2pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76ba0fff91ae1ac334c2babbc10f0d65139b711
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/tensorflow2pytorch.py
@@ -0,0 +1,387 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
+import torch
+import json
+import os, sys
+
+from dependencies.facenet.src import facenet
+from dependencies.facenet.src.models import inception_resnet_v1 as tf_mdl
+from dependencies.facenet.src.align import detect_face
+
+from models.inception_resnet_v1 import InceptionResnetV1
+from models.mtcnn import PNet, RNet, ONet
+
+
+def import_tf_params(tf_mdl_dir, sess):
+ """Import tensorflow model from save directory.
+
+ Arguments:
+ tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
+ sess {tensorflow.Session} -- Tensorflow session object.
+
+ Returns:
+ (list, list, list) -- Tuple of lists containing the layer names,
+ parameter arrays as numpy ndarrays, parameter shapes.
+ """
+ print('\nLoading tensorflow model\n')
+ if callable(tf_mdl_dir):
+ tf_mdl_dir(sess)
+ else:
+ facenet.load_model(tf_mdl_dir)
+
+ print('\nGetting model weights\n')
+ images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
+ print(images_placeholder)
+ tf_layers = tf.trainable_variables()
+ tf_params = sess.run(tf_layers)
+ print(tf.get_default_graph())
+
+ tf_shapes = [p.shape for p in tf_params]
+ tf_layers = [l.name for l in tf_layers]
+
+    print(tf_shapes)
+
+ if not callable(tf_mdl_dir):
+ path = os.path.join(tf_mdl_dir, 'layer_description.json')
+ else:
+ path = 'data/layer_description.json'
+ with open(path, 'w') as f:
+ json.dump({l: s for l, s in zip(tf_layers, tf_shapes)}, f)
+
+ return tf_layers, tf_params, tf_shapes
+
+
+def get_layer_indices(layer_lookup, tf_layers):
+ """Giving a lookup of model layer attribute names and tensorflow variable names,
+ find matching parameters.
+
+ Arguments:
+ layer_lookup {dict} -- Dictionary mapping pytorch attribute names to (partial)
+ tensorflow variable names. Expects dict of the form {'attr': ['tf_name', ...]}
+ where the '...'s are ignored.
+ tf_layers {list} -- List of tensorflow variable names.
+
+ Returns:
+        dict -- The input dictionary with the list of matching indices appended to each item.
+ """
+ layer_inds = {}
+ for name, value in layer_lookup.items():
+ layer_inds[name] = value + [[i for i, n in enumerate(tf_layers) if value[0] in n]]
+ return layer_inds
+
+
+def load_tf_batchNorm(weights, layer):
+ """Load tensorflow weights into nn.BatchNorm object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.BatchNorm.
+ """
+ layer.bias.data = torch.tensor(weights[0]).view(layer.bias.data.shape)
+ layer.weight.data = torch.ones_like(layer.weight.data)
+ layer.running_mean = torch.tensor(weights[1]).view(layer.running_mean.shape)
+ layer.running_var = torch.tensor(weights[2]).view(layer.running_var.shape)
+
+
+def load_tf_conv2d(weights, layer, transpose=False):
+ """Load tensorflow weights into nn.Conv2d object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.Conv2d.
+ """
+ if isinstance(weights, list):
+ if len(weights) == 2:
+ layer.bias.data = (
+ torch.tensor(weights[1])
+ .view(layer.bias.data.shape)
+ )
+ weights = weights[0]
+
+ if transpose:
+ dim_order = (3, 2, 1, 0)
+ else:
+ dim_order = (3, 2, 0, 1)
+
+ layer.weight.data = (
+ torch.tensor(weights)
+ .permute(dim_order)
+ .view(layer.weight.data.shape)
+ )
+
+
+def load_tf_conv2d_trans(weights, layer):
+ return load_tf_conv2d(weights, layer, transpose=True)
+
+
+def load_tf_basicConv2d(weights, layer):
+ """Load tensorflow weights into grouped Conv2d+BatchNorm object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- Object containing Conv2d+BatchNorm.
+ """
+ load_tf_conv2d(weights[0], layer.conv)
+ load_tf_batchNorm(weights[1:], layer.bn)
+
+
+def load_tf_linear(weights, layer):
+ """Load tensorflow weights into nn.Linear object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.Linear.
+ """
+ if isinstance(weights, list):
+ if len(weights) == 2:
+ layer.bias.data = (
+ torch.tensor(weights[1])
+ .view(layer.bias.data.shape)
+ )
+ weights = weights[0]
+ layer.weight.data = (
+ torch.tensor(weights)
+ .transpose(-1, 0)
+ .view(layer.weight.data.shape)
+ )
+
+
+# High-level parameter-loading functions:
+
+def load_tf_block35(weights, layer):
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch2[0])
+ load_tf_basicConv2d(weights[16:20], layer.branch2[1])
+ load_tf_basicConv2d(weights[20:24], layer.branch2[2])
+ load_tf_conv2d(weights[24:26], layer.conv2d)
+
+
+def load_tf_block17_8(weights, layer):
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[2])
+ load_tf_conv2d(weights[16:18], layer.conv2d)
+
+
+def load_tf_mixed6a(weights, layer):
+ if len(weights) != 16:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 16')
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[2])
+
+
+def load_tf_mixed7a(weights, layer):
+ if len(weights) != 28:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 28')
+ load_tf_basicConv2d(weights[:4], layer.branch0[0])
+ load_tf_basicConv2d(weights[4:8], layer.branch0[1])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[0])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[1])
+ load_tf_basicConv2d(weights[16:20], layer.branch2[0])
+ load_tf_basicConv2d(weights[20:24], layer.branch2[1])
+ load_tf_basicConv2d(weights[24:28], layer.branch2[2])
+
+
+def load_tf_repeats(weights, layer, rptlen, subfun):
+ if len(weights) % rptlen != 0:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not divisible by {rptlen}')
+ weights_split = [weights[i:i+rptlen] for i in range(0, len(weights), rptlen)]
+ for i, w in enumerate(weights_split):
+ subfun(w, getattr(layer, str(i)))
+
+
+def load_tf_repeat_1(weights, layer):
+ load_tf_repeats(weights, layer, 26, load_tf_block35)
+
+
+def load_tf_repeat_2(weights, layer):
+ load_tf_repeats(weights, layer, 18, load_tf_block17_8)
+
+
+def load_tf_repeat_3(weights, layer):
+ load_tf_repeats(weights, layer, 18, load_tf_block17_8)
+
+
+def test_loaded_params(mdl, tf_params, tf_layers):
+ """Check each parameter in a pytorch model for an equivalent parameter
+ in a list of tensorflow variables.
+
+ Arguments:
+ mdl {torch.nn.Module} -- Pytorch model.
+ tf_params {list} -- List of ndarrays representing tensorflow variables.
+ tf_layers {list} -- Corresponding list of tensorflow variable names.
+ """
+ tf_means = torch.stack([torch.tensor(p).mean() for p in tf_params])
+ for name, param in mdl.named_parameters():
+ pt_mean = param.data.mean()
+ matching_inds = ((tf_means - pt_mean).abs() < 1e-8).nonzero()
+ print(f'{name} equivalent to {[tf_layers[i] for i in matching_inds]}')
+
+
+def compare_model_outputs(pt_mdl, sess, test_data):
+ """Given some testing data, compare the output of pytorch and tensorflow models.
+
+ Arguments:
+ pt_mdl {torch.nn.Module} -- Pytorch model.
+ sess {tensorflow.Session} -- Tensorflow session object.
+ test_data {torch.Tensor} -- Pytorch tensor.
+ """
+ print('\nPassing test data through TF model\n')
+ if isinstance(sess, tf.Session):
+ images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
+ phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
+ embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
+ feed_dict = {images_placeholder: test_data.numpy(), phase_train_placeholder: False}
+ tf_output = torch.tensor(sess.run(embeddings, feed_dict=feed_dict))
+ else:
+ tf_output = sess(test_data)
+
+ print(tf_output.shape, tf_output)
+
+ print('\nPassing test data through PT model\n')
+ pt_output = pt_mdl(test_data.permute(0, 3, 1, 2))
+ print(pt_output.shape, pt_output)
+
+ distance = (tf_output - pt_output).norm()
+ print(f'\nDistance {distance}\n')
+
+
+def compare_mtcnn(pt_mdl, tf_fun, sess, ind, test_data):
+ tf_mdls = tf_fun(sess)
+ tf_mdl = tf_mdls[ind]
+
+ print('\nPassing test data through TF model\n')
+ tf_output = tf_mdl(test_data.numpy())
+ tf_output = [torch.tensor(out) for out in tf_output]
+ print('\n'.join([str(o.view(-1)[:10]) for o in tf_output]))
+
+ print('\nPassing test data through PT model\n')
+ with torch.no_grad():
+ pt_output = pt_mdl(test_data.permute(0, 3, 2, 1))
+ pt_output = [torch.tensor(out) for out in pt_output]
+ for i in range(len(pt_output)):
+ if len(pt_output[i].shape) == 4:
+ pt_output[i] = pt_output[i].permute(0, 3, 2, 1).contiguous()
+ print('\n'.join([str(o.view(-1)[:10]) for o in pt_output]))
+
+ distance = [(tf_o - pt_o).norm() for tf_o, pt_o in zip(tf_output, pt_output)]
+ print(f'\nDistance {distance}\n')
+
+
+def load_tf_model_weights(mdl, layer_lookup, tf_mdl_dir, is_resnet=True, arg_num=None):
+ """Load tensorflow parameters into a pytorch model.
+
+ Arguments:
+ mdl {torch.nn.Module} -- Pytorch model.
+ layer_lookup {[type]} -- Dictionary mapping pytorch attribute names to (partial)
+ tensorflow variable names, and a function suitable for loading weights.
+ Expects dict of the form {'attr': ['tf_name', function]}.
+ tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
+ """
+ tf.reset_default_graph()
+ with tf.Session() as sess:
+ tf_layers, tf_params, tf_shapes = import_tf_params(tf_mdl_dir, sess)
+ layer_info = get_layer_indices(layer_lookup, tf_layers)
+
+ for layer_name, info in layer_info.items():
+ print(f'Loading {info[0]}/* into {layer_name}')
+ weights = [tf_params[i] for i in info[2]]
+ layer = getattr(mdl, layer_name)
+ info[1](weights, layer)
+
+ test_loaded_params(mdl, tf_params, tf_layers)
+
+ if is_resnet:
+ compare_model_outputs(mdl, sess, torch.randn(5, 160, 160, 3).detach())
+
+
+def tensorflow2pytorch(args):
+ lookup_inception_resnet_v1 = {
+ 'conv2d_1a': ['InceptionResnetV1/Conv2d_1a_3x3', load_tf_basicConv2d],
+ 'conv2d_2a': ['InceptionResnetV1/Conv2d_2a_3x3', load_tf_basicConv2d],
+ 'conv2d_2b': ['InceptionResnetV1/Conv2d_2b_3x3', load_tf_basicConv2d],
+ 'conv2d_3b': ['InceptionResnetV1/Conv2d_3b_1x1', load_tf_basicConv2d],
+ 'conv2d_4a': ['InceptionResnetV1/Conv2d_4a_3x3', load_tf_basicConv2d],
+ 'conv2d_4b': ['InceptionResnetV1/Conv2d_4b_3x3', load_tf_basicConv2d],
+ 'repeat_1': ['InceptionResnetV1/Repeat/block35', load_tf_repeat_1],
+ 'mixed_6a': ['InceptionResnetV1/Mixed_6a', load_tf_mixed6a],
+ 'repeat_2': ['InceptionResnetV1/Repeat_1/block17', load_tf_repeat_2],
+ 'mixed_7a': ['InceptionResnetV1/Mixed_7a', load_tf_mixed7a],
+ 'repeat_3': ['InceptionResnetV1/Repeat_2/block8', load_tf_repeat_3],
+ 'block8': ['InceptionResnetV1/Block8', load_tf_block17_8],
+ 'last_linear': ['InceptionResnetV1/Bottleneck/weights', load_tf_linear],
+ 'last_bn': ['InceptionResnetV1/Bottleneck/BatchNorm', load_tf_batchNorm],
+ # 'logits': ['Logits', load_tf_linear],
+ }
+
+ print('\nLoad CASIA-Webface-trained weights and save\n')
+ mdl = InceptionResnetV1(num_classes=10575).eval()
+ tf_mdl_dir = args.facenet_pb_path
+
+ load_tf_model_weights(mdl, lookup_inception_resnet_v1, tf_mdl_dir)
+ # data_name = 'casia-webfacexxxxxxx'
+ # state_dict = mdl.state_dict()
+ # torch.save(state_dict, f'{tf_mdl_dir}-{data_name}.pt')
+
+ x = torch.rand(64, 3, 160, 160)#.cuda()
+ # y = resnet(x)
+ # print(y.shape)
+
+
+ f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
+ torch.onnx.export(mdl, x, f, verbose=False, opset_version=11,
+ input_names=['input'], output_names=['output'], dynamic_axes=None)
+
+
+
+import argparse
+def parse_args():
+ parser = argparse.ArgumentParser("deploy facenet")
+ parser.add_argument("--facenet_weights_path", default="", help="onnx model path")
+ parser.add_argument("--facenet_pb_path", default="", help="")
+ parser.add_argument("--onnx_save_name", default="", help="")
+
+ return parser.parse_args()
+args = parse_args()
+
+tensorflow2pytorch(args)
+
+
+# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+# print('Running on device: {}'.format(device))
+
+# # Load pretrained resnet model
+# resnet = InceptionResnetV1(
+# classify=False,
+# pretrained='casia-webface'
+# )#.to(device)
+
+# x = torch.rand(64, 3, 160, 160)#.cuda()
+# y = resnet(x)
+# print(y.shape)
+
+
+# f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
+# torch.onnx.export(resnet, x, f, verbose=False, opset_version=11, input_names=['input'], output_names=['output'], dynamic_axes=None)
diff --git a/models/cv/face/facenet/ixrt/utils.py b/models/cv/face/facenet/ixrt/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab8f213bf6bf629ad073140f4ab886760c707759
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/utils.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import math
+
+from sklearn.model_selection import KFold
+from scipy import interpolate
+import numpy as np
+
+
+# LFW functions taken from David Sandberg's FaceNet implementation
+def distance(embeddings1, embeddings2, distance_metric=0):
+ if distance_metric==0:
+        # Euclidean distance
+ diff = np.subtract(embeddings1, embeddings2)
+ dist = np.sum(np.square(diff),1)
+ elif distance_metric==1:
+ # Distance based on cosine similarity
+ dot = np.sum(np.multiply(embeddings1, embeddings2), axis=1)
+ norm = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
+ similarity = dot / norm
+ dist = np.arccos(similarity) / math.pi
+ else:
+        raise ValueError('Undefined distance metric %d' % distance_metric)
+
+ return dist
+
+def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ assert(embeddings1.shape[0] == embeddings2.shape[0])
+ assert(embeddings1.shape[1] == embeddings2.shape[1])
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+ nrof_thresholds = len(thresholds)
+ k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+ tprs = np.zeros((nrof_folds,nrof_thresholds))
+ fprs = np.zeros((nrof_folds,nrof_thresholds))
+ accuracy = np.zeros((nrof_folds))
+
+ is_false_positive = []
+ is_false_negative = []
+
+ indices = np.arange(nrof_pairs)
+
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+ if subtract_mean:
+ mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
+ else:
+ mean = 0.0
+ dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+
+ # Find the best threshold for the fold
+ acc_train = np.zeros((nrof_thresholds))
+ for threshold_idx, threshold in enumerate(thresholds):
+ _, _, acc_train[threshold_idx], _ ,_ = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
+ best_threshold_index = np.argmax(acc_train)
+ for threshold_idx, threshold in enumerate(thresholds):
+ tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _, _, _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
+ _, _, accuracy[fold_idx], is_fp, is_fn = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
+
+ tpr = np.mean(tprs,0)
+ fpr = np.mean(fprs,0)
+ is_false_positive.extend(is_fp)
+ is_false_negative.extend(is_fn)
+
+ return tpr, fpr, accuracy, is_false_positive, is_false_negative
+
+def calculate_accuracy(threshold, dist, actual_issame):
+ predict_issame = np.less(dist, threshold)
+ tp = np.sum(np.logical_and(predict_issame, actual_issame))
+ fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+ tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
+ fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
+
+ is_fp = np.logical_and(predict_issame, np.logical_not(actual_issame))
+ is_fn = np.logical_and(np.logical_not(predict_issame), actual_issame)
+
+ tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn)
+ fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn)
+ acc = float(tp+tn)/dist.size
+ return tpr, fpr, acc, is_fp, is_fn
+
+def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ assert(embeddings1.shape[0] == embeddings2.shape[0])
+ assert(embeddings1.shape[1] == embeddings2.shape[1])
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+ nrof_thresholds = len(thresholds)
+ k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+ val = np.zeros(nrof_folds)
+ far = np.zeros(nrof_folds)
+
+ indices = np.arange(nrof_pairs)
+
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+ if subtract_mean:
+ mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
+ else:
+ mean = 0.0
+ dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+
+ # Find the threshold that gives FAR = far_target
+ far_train = np.zeros(nrof_thresholds)
+ for threshold_idx, threshold in enumerate(thresholds):
+ _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
+ if np.max(far_train)>=far_target:
+ f = interpolate.interp1d(far_train, thresholds, kind='slinear')
+ threshold = f(far_target)
+ else:
+ threshold = 0.0
+
+ val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
+
+ val_mean = np.mean(val)
+ far_mean = np.mean(far)
+ val_std = np.std(val)
+ return val_mean, val_std, far_mean
+
+def calculate_val_far(threshold, dist, actual_issame):
+ predict_issame = np.less(dist, threshold)
+ true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
+ false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+ n_same = np.sum(actual_issame)
+ n_diff = np.sum(np.logical_not(actual_issame))
+ val = float(true_accept) / float(n_same)
+ far = float(false_accept) / float(n_diff)
+ return val, far
+
+
+
+def evaluate(embeddings, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ # Calculate evaluation metrics
+ thresholds = np.arange(0, 4, 0.01)
+ embeddings1 = embeddings[0::2]
+ embeddings2 = embeddings[1::2]
+ tpr, fpr, accuracy, fp, fn = calculate_roc(thresholds, embeddings1, embeddings2,
+ np.asarray(actual_issame), nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+ thresholds = np.arange(0, 4, 0.001)
+ val, val_std, far = calculate_val(thresholds, embeddings1, embeddings2,
+ np.asarray(actual_issame), 1e-3, nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+ return tpr, fpr, accuracy, val, val_std, far, fp, fn
+
+def add_extension(path):
+ if os.path.exists(path+'.jpg'):
+ return path+'.jpg'
+ elif os.path.exists(path+'.png'):
+ return path+'.png'
+ else:
+ raise RuntimeError('No file "%s" with extension png or jpg.' % path)
+
+def get_paths(lfw_dir, pairs):
+ nrof_skipped_pairs = 0
+ path_list = []
+ issame_list = []
+ for pair in pairs:
+ if len(pair) == 3:
+ path0 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])))
+ path1 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[2])))
+ issame = True
+ elif len(pair) == 4:
+ path0 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])))
+ path1 = add_extension(os.path.join(lfw_dir, pair[2], pair[2] + '_' + '%04d' % int(pair[3])))
+ issame = False
+ if os.path.exists(path0) and os.path.exists(path1): # Only add the pair if both paths exist
+ path_list += (path0,path1)
+ issame_list.append(issame)
+ else:
+ nrof_skipped_pairs += 1
+ if nrof_skipped_pairs>0:
+ print('Skipped %d image pairs' % nrof_skipped_pairs)
+
+ return path_list, issame_list
+
+def read_pairs(pairs_filename):
+ pairs = []
+ with open(pairs_filename, 'r') as f:
+ for line in f.readlines()[1:]:
+ pair = line.strip().split()
+ pairs.append(pair)
+ return np.array(pairs, dtype=object)
\ No newline at end of file
diff --git a/models/nlp/language_model/albert/ixrt/README.md b/models/nlp/language_model/albert/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cace00994d1b030154c2064f87d8f110029edbd0
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/README.md
@@ -0,0 +1,105 @@
+# AlBERT
+
+## Description
+
+Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representations from Transformers) model that focuses on efficiency and scalability while maintaining strong performance in natural language processing tasks. It introduces parameter-reduction techniques, factorized embedding parameterization and cross-layer parameter sharing, together with a sentence-order prediction objective, to enhance its effectiveness.
+
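+As a rough illustration of the parameter reduction (a minimal, hypothetical sketch that is not part of this repository; it assumes `torch` and `transformers` are available as in the Install step below, and the ALBERT hyperparameters are chosen to approximate albert-base-v2):
+
+```python
+# Compare parameter counts of an ALBERT-base-like model and BERT-base built from
+# configs only; the models are randomly initialized, so no weights are downloaded.
+from transformers import AlbertConfig, AlbertModel, BertConfig, BertModel
+
+albert = AlbertModel(AlbertConfig(
+    embedding_size=128, hidden_size=768, num_hidden_layers=12,
+    num_attention_heads=12, intermediate_size=3072))  # factorized embeddings + cross-layer sharing
+bert = BertModel(BertConfig())  # default config corresponds to BERT-base
+
+count = lambda m: sum(p.numel() for p in m.parameters())
+print(f"ALBERT-base-like parameters: {count(albert) / 1e6:.1f}M")
+print(f"BERT-base parameters:        {count(bert) / 1e6:.1f}M")
+```
+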
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the squad dataset.
+
+Or you can run:
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/albert/ixrt
+cd ${MODEL_PATH}
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+Please correct the paths in the following commands or files.
+
+```bash
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/albert-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/open_albert/albert-base-squad.pt --output_path albert-torch-fp32.onnx
+onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
+```
+
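+After conversion, the simplified ONNX file can optionally be sanity-checked before building an engine (a minimal sketch, assuming the conversion above produced `albert-torch-fp32-sim.onnx` in the current directory and that the `onnx` package from the Install step is available):
+
+```python
+# Validate the exported graph and list its input/output names.
+import onnx
+
+model = onnx.load("albert-torch-fp32-sim.onnx")
+onnx.checker.check_model(model)
+print("inputs :", [i.name for i in model.graph.input])
+print("outputs:", [o.name for o in model.graph.output])
+```
+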
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./albert-torch-fp32-sim
+export OPTIMIER_FILE=./ixrt-oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash scripts/infer_albert_fp16_performance.sh
+```
+
+### Accuracy
+
+```bash
+# get madlag.tar
+wget http://files.deepspark.org.cn:880/deepspark/madlag.tar
+tar xvf madlag.tar
+rm -f madlag.tar
+
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# modify perf_engine.py
+mv ./perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# edit madlag/albert-base-v2-squad path
+sed -i "s#madlag#/${MODEL_PATH}/madlag#" ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
+
+# copy open_squad data
+cp datasets/open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/
+
+# copy open_albert data
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_albert
+cp ./general_perf/model_zoo/popular/open_albert/*.pt ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_albert
+
+# run acc script
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/common.py
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/compile_backend_iluvatar.py
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/runtime_backend_iluvatar.py
+python3 core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Exact Match | F1 Score |
+| ------ | --------- | --------- | ----- | ----------- | -------- |
+| AlBERT | 1 | FP16 | 50.99 | 80.18 | 87.57 |
diff --git a/models/nlp/language_model/albert/ixrt/perf_engine.py b/models/nlp/language_model/albert/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+            log.info("******************************************* Running QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh b/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..977eb85c3f3cb4aa4b337c79c9246114f369bf41
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+
+run(){
+ BS=16
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..115faac30dcef7617327a4083e4a67f1ff4c322b
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_squad/
+
+wget -O general_perf/download/open_albert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_albert.tar
+tar xf general_perf/download/open_albert.tar -C general_perf/model_zoo/popular/
+
+
+# # Download Datasets
+wget -O general_perf/download/open_squad.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf general_perf/download/open_squad.tar -C datasets/open_squad/
+
+
+echo "Extract Done."
diff --git a/models/nlp/language_model/albert/ixrt/torch2onnx.py b/models/nlp/language_model/albert/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f115b730caf065b3f3dfc496c161916afc96d9e
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+    # output_path ends with ".onnx"; dropping the last four characters keeps the
+    # trailing dot, so model_name + "json" points at the matching "<name>.json" file.
+    model_name = output_path.split("/")[-1][:-4]
+    with open(model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/README.md b/models/nlp/language_model/deberta/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..221a33a895d476b2d73672ab6c26420528d0a33a
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/README.md
@@ -0,0 +1,102 @@
+# DeBERTa
+
+## Description
+
+DeBERTa (Decoding-enhanced BERT with disentangled attention) is an enhanced version of the BERT (Bidirectional Encoder Representations from Transformers) model. It improves text representation learning through a disentangled attention mechanism and a decoding enhancement technique. Disentangled attention decomposes the attention score into separate content and relative-position terms (content-to-content, content-to-position, and position-to-content), which helps the model better capture relationships between tokens. The decoding enhancement adjusts the decoder, incorporating absolute position information and adapting it during fine-tuning, so the model better suits specific downstream tasks and performs better on them.
+
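+The decomposition can be illustrated with a small, self-contained sketch (a toy example of the idea only, not the code used by this model; the tensor sizes and the relative-position bucketing below are arbitrary assumptions):
+
+```python
+# Toy disentangled attention: scores are the sum of content-to-content,
+# content-to-position and position-to-content terms, scaled by sqrt(3*d).
+import torch
+
+B, L, d = 2, 8, 16                      # batch, sequence length, head dimension
+Hc = torch.randn(B, L, d)               # content representations
+P = torch.randn(2 * L, d)               # shared relative-position embeddings
+rel = torch.arange(L)[:, None] - torch.arange(L)[None, :] + L  # relative-position bucket ids
+
+scores = torch.zeros(B, L, L)
+for i in range(L):
+    for j in range(L):
+        scores[:, i, j] = (
+            (Hc[:, i] * Hc[:, j]).sum(-1)          # content-to-content
+            + (Hc[:, i] * P[rel[i, j]]).sum(-1)    # content-to-position
+            + (P[rel[j, i]] * Hc[:, j]).sum(-1)    # position-to-content
+        )
+attn = torch.softmax(scores / (3 * d) ** 0.5, dim=-1)
+print(attn.shape)  # (B, L, L)
+```
+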
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/deberta/ixrt
+cd ${MODEL_PATH}
+
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install tf2onnx
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model: < >
+
+Dataset: < > to download the squad dataset.
+
+```bash
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+```bash
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/deberta-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx
+onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx
+python3 remove_clip_and_cast.py
+
+```
+
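+To confirm that the clean-up step worked, the remaining op types in the exported graph can be counted (a minimal sketch; it assumes `remove_clip_and_cast.py` writes `deberta-sim-drop-clip-drop-invaild-cast.onnx`, matching the `ORIGIN_ONNX_NAME` used below):
+
+```python
+# Count op types in the cleaned graph; Clip and the invalid Cast nodes should be gone.
+from collections import Counter
+import onnx
+
+model = onnx.load("deberta-sim-drop-clip-drop-invaild-cast.onnx")
+ops = Counter(node.op_type for node in model.graph.node)
+print("Clip nodes:", ops.get("Clip", 0), "| Cast nodes:", ops.get("Cast", 0))
+```
+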
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast
+export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash scripts/infer_deberta_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the website < >, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document: < >. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# setup
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cp ./datasets/open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/
+
+mv ./deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/
+mv ./general_perf/model_zoo/popular/ ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/
+
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+wget http://files.deepspark.org.cn:880/deepspark/Palak.tar
+tar -zxvf Palak.tar
+
+# Then edit ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py: change AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") to AutoTokenizer.from_pretrained("/Your/Path/Palak/microsoft_deberta-base_squad")
+
+# run acc perf
+sed -i 's/tensorrt_legacy/tensorrt/g' backends/ILUVATAR/common.py
+python3 core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Exact Match | F1 Score |
+| ------- | --------- | --------- | ----- | ----------- | -------- |
+| DeBERTa | 1 | FP16 | 18.58 | 73.76 | 81.24 |
diff --git a/models/nlp/language_model/deberta/ixrt/perf_engine.py b/models/nlp/language_model/deberta/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+            log.info("******************************************* Running QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py b/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d362439f13a195b8ba895a70407a59ae881db181
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx_graphsurgeon as gs
+import onnx
+
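+# Map an ONNX opset version to a compatible IR version so the re-saved model stays loadable.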
+onnx_op_set_2_ir_version = {
+ 11:6,
+ 12:7,
+ 13:7,
+}
+
+visited_add_tensor = {}
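+# Fold the Add/Clip arithmetic into the Add node's constant tensor: shift the
+# indices by 384 and clamp them to [0, 767] ahead of time, then feed the
+# precomputed tensor straight into Expand so the runtime Clip/Cast chain becomes
+# dead and is removed by graph.cleanup(). visited_add_tensor guards shared
+# constants against being offset twice.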
+def replace_expand_values(graph, expand_node, clip_node, cast_node, sub_node, add_node):
+ if add_node.inputs[0].name not in visited_add_tensor:
+ print(add_node.inputs[0].name)
+ print(add_node.inputs[0].values)
+ add_node.inputs[0].values = add_node.inputs[0].values + 384
+ add_node.inputs[0].values[add_node.inputs[0].values < 0] = 0
+ add_node.inputs[0].values[add_node.inputs[0].values > 767] = 767
+ print(add_node.inputs[0].values)
+ visited_add_tensor[add_node.inputs[0].name] = True
+ expand_node.inputs = [add_node.inputs[0]] + expand_node.inputs[1:]
+
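+# Locate Expand nodes whose data input is produced by a Clip, where the Clip's
+# upper bound comes from a Cast of a Sub and whose data input comes from the Add
+# node holding the constant index tensor; each match is folded by
+# replace_expand_values().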
+def replace_clip_related_nodes(graph):
+ node_name_to_index_map = {}
+ expand_node_names = []
+ output_name_to_node_name_map = {}
+ for i, node in enumerate(graph.nodes):
+ node_name_to_index_map[node.name] = i
+ if node.op == "Expand":
+ expand_node_names.append(node.name)
+ for j in node.outputs:
+ output_name_to_node_name_map[j.name] = node.name
+
+ for name in expand_node_names:
+ expand_node = graph.nodes[node_name_to_index_map[name]]
+ expand_producer_name = output_name_to_node_name_map[expand_node.inputs[0].name]
+ expand_producer = graph.nodes[node_name_to_index_map[expand_producer_name]]
+ if expand_producer.op == "Clip":
+ clip_node = expand_producer
+ clip_producer_name = output_name_to_node_name_map[clip_node.inputs[-1].name]
+ clip_producer = graph.nodes[node_name_to_index_map[clip_producer_name]]
+ if clip_producer.op == "Cast":
+ cast_producer_name = output_name_to_node_name_map[clip_producer.inputs[0].name]
+ cast_producer = graph.nodes[node_name_to_index_map[cast_producer_name]]
+ if cast_producer.op == "Sub":
+ add_node_name = output_name_to_node_name_map[clip_node.inputs[0].name]
+ add_node = graph.nodes[node_name_to_index_map[add_node_name]]
+ replace_expand_values(graph, expand_node, clip_node, clip_producer, cast_producer, add_node)
+
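+# Collapse back-to-back Cast nodes: when a Cast's input is itself produced by
+# another Cast, rewire it to the upstream Cast's inputs so the redundant
+# intermediate cast is dropped during cleanup().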
+def drop_cast_nodes(graph):
+ node_name_to_index_map = {}
+ cast_node_names = []
+ output_name_to_node_name_map = {}
+ for i, node in enumerate(graph.nodes):
+ node_name_to_index_map[node.name] = i
+ if node.op == "Cast":
+ cast_node_names.append(node.name)
+ for j in node.outputs:
+ output_name_to_node_name_map[j.name] = node.name
+
+ for name in cast_node_names:
+ cast_node = graph.nodes[node_name_to_index_map[name]]
+ cast_producer_name = output_name_to_node_name_map[cast_node.inputs[0].name]
+ cast_producer = graph.nodes[node_name_to_index_map[cast_producer_name]]
+ if cast_producer.op == "Cast":
+ cast_node.inputs = cast_producer.inputs
+
+
+input_path = r"./deberta-torch-fp32-sim.onnx"
+save_path = r"./deberta-sim-drop-clip-drop-invaild-cast.onnx"
+graph = gs.import_onnx(onnx.load(input_path))
+
+replace_clip_related_nodes(graph)
+drop_cast_nodes(graph)
+
+graph.cleanup().toposort()
+onnx.save(gs.export_onnx(graph), save_path)
+
+model = onnx.load(save_path)
+model.ir_version = onnx_op_set_2_ir_version[model.opset_import[0].version]
+onnx.save(model, save_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh b/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c9ced2418f6be8ff775b509d01d0db121af79108
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384\
+ --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --plugins ixrt_plugin
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..575ab8f7d141b387de01d44e84962d8e7e7900dc
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_squad/
+
+wget -O general_perf/download/open_deberta.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_deberta.tar
+tar xf general_perf/download/open_deberta.tar -C general_perf/model_zoo/popular/
+
+
+# Download Datasets
+wget -O general_perf/download/open_squad.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf general_perf/download/open_squad.tar -C datasets/open_squad/
+
+
+echo "Extract Done."
diff --git a/models/nlp/language_model/deberta/ixrt/torch2onnx.py b/models/nlp/language_model/deberta/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7c06bbd18710f0820870c1ae5711505dd136bb
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split("/")[-1][:-4]
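+    # [:-4] strips only the "onnx" suffix, so model_name keeps its trailing dot
+    # and appending "json" below resolves to "<model>.json" in the current directory.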
+ with open("./" + model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
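+    # Mark axis 0 (batch) of every input and output as dynamic so the exported
+    # ONNX model accepts arbitrary batch sizes.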
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/README.md b/models/nlp/language_model/roberta/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0588c797f1f8bf147fe5d37607cf34c2821e7f6d
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/README.md
@@ -0,0 +1,100 @@
+# RoBERTa
+
+## Description
+
+Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.
+
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/roberta/ixrt
+cd ${MODEL_PATH}
+
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+pip3 install bert
+pip3 install pycuda
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+cd ${PROJ_ROOT}/models/nlp/language_model/roberta/ixrt/
+
+# get open_roberta
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roberta.tar
+tar xf open_roberta.tar
+rm -f open_roberta.tar
+
+# get roberta-torch-fp32.json
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/roberta-torch-fp32.json
+
+# export onnx
+python3 export_onnx.py --model_path open_roberta/roberta-base-squad.pt --output_path open_roberta/roberta-torch-fp32.onnx
+
+# Simplify onnx model
+onnxsim open_roberta/roberta-torch-fp32.onnx open_roberta/roberta-torch-fp32_sim.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./open_roberta/roberta-torch-fp32_sim
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_roberta_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the ByteMLPerf website, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to the corresponding ByteMLPerf document. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# Link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Move open_roberta
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+mv open_roberta ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+
+# Get open_squad
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf open_squad.tar
+cp ./open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad
+rm -f open_squad.tar
+
+# Get csarron.tar
+wget http://files.deepspark.org.cn:880/deepspark/csarron.tar
+tar xf csarron.tar
+rm -f csarron.tar
+mv csarron/ ./ByteMLPerf/byte_infer_perf/
+
+# Run Acc scripts
+cd ./ByteMLPerf/byte_infer_perf/
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | F1 | Exact Match |
+| ------- | --------- | --------- | ------ | -------- | ----------- |
+| RoBERTa | 1 | FP16 | 355.48 | 83.14387 | 76.50175 |
diff --git a/models/nlp/language_model/roberta/ixrt/export_onnx.py b/models/nlp/language_model/roberta/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc9d2da750a00a4eefd2323faf0354d9eb3eaf69
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/export_onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split(".")[0]
+ with open(model_name + ".json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
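+    # Mark axis 0 (batch) of every input and output as dynamic so the exported
+    # ONNX model accepts arbitrary batch sizes.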
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/gen_data.py b/models/nlp/language_model/roberta/ixrt/gen_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..a59225b2613b2e456b88ed4c79329287713e77a6
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/gen_data.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+
+import numpy as np
+import torch
+
+
+def gen_data(batch_size, output):
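+    # Random token ids drawn from RoBERTa's 50265-token vocabulary, with the
+    # fixed sequence length of 384 expected by the engine; the attention mask
+    # is all ones and the token type ids are all zeros.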
+ a = torch.randint(0, 50265, (batch_size, 384))
+ a = a.numpy().astype(np.int64)
+ a.tofile(output+"input_ids.bin")
+
+ a = np.ones((batch_size, 384), dtype=np.int64)
+ a.tofile(output+"input_mask.bin")
+
+ a = np.zeros((batch_size, 384), dtype=np.int64)
+ a.tofile(output+"token_type_ids.bin")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Generate data for RoBERTa model.")
+ parser.add_argument(
+ "--batch_size", type=int, required=True, help="Batch size for data generation"
+ )
+ parser.add_argument("--output_path", default="")
+
+ args = parser.parse_args()
+
+ gen_data(args.batch_size, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/perf_engine.py b/models/nlp/language_model/roberta/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f108474b281bfce71ccaf73d60ba3119cf97c1
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh b/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..90bdec9be8b064f41e4c5c96a40bd09d1f52b253
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ python3 ${PROJ_PATH}/gen_data.py --batch_size ${BS} --output_path ${PROJ_PATH}
+
+ # Graph optimize
+ [ -f "${TARGET_ONNX}" ] || python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/README.md b/models/nlp/language_model/roformer/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c088cf0f740821d5cc96557dbc53588f4ee5866f
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/README.md
@@ -0,0 +1,105 @@
+# RoFormer
+
+## Description
+
+Position encoding recently has shown effective in the transformer architecture. It enables valuable supervision for dependency modeling between elements at different positions of the sequence. In this paper, we first investigate various methods to integrate positional information into the learning process of transformer-based language models. Then, we propose a novel method named Rotary Position Embedding(RoPE) to effectively leverage the positional information. Specifically, the proposed RoPE encodes the absolute position with a rotation matrix and meanwhile incorporates the explicit relative position dependency in self-attention formulation. Notably, RoPE enables valuable properties, including the flexibility of sequence length, decaying inter-token dependency with increasing relative distances, and the capability of equipping the linear self-attention with relative position encoding. Finally, we evaluate the enhanced transformer with rotary position embedding, also called RoFormer, on various long text classification benchmark datasets.
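+
+As a rough illustration of the mechanism described above (a minimal NumPy sketch, not RoFormer's actual implementation), the snippet below rotates consecutive feature pairs of a query/key vector by position-dependent angles; because rotations compose, the resulting dot product depends only on the relative offset between the two positions.
+
+```python
+import numpy as np
+
+def rotary_embed(x, position, base=10000.0):
+    """Apply a simplified rotary position embedding to a 1-D feature vector."""
+    d = x.shape[-1]
+    # One rotation frequency per feature pair, as in sinusoidal encodings.
+    freqs = base ** (-np.arange(0, d, 2) / d)
+    theta = position * freqs
+    cos, sin = np.cos(theta), np.sin(theta)
+    x1, x2 = x[0::2], x[1::2]
+    out = np.empty_like(x)
+    out[0::2] = x1 * cos - x2 * sin
+    out[1::2] = x1 * sin + x2 * cos
+    return out
+
+q, k = np.random.randn(64), np.random.randn(64)
+# For fixed q and k, this score depends only on the relative offset 7 - 3.
+score = np.dot(rotary_embed(q, position=7), rotary_embed(k, position=3))
+```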
+
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install tf2onnx
+pip3 install pycuda
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/roformer/ixrt
+cd ${MODEL_PATH}
+
+# Download the pretrained model and dataset to 'data'
+mkdir -p data/
+pushd data/
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roformer.tar
+tar xf open_roformer.tar
+rm -f open_roformer.tar
+popd
+```
+
+### Deal with ONNX
+
+```bash
+# export onnx
+python3 export_onnx.py --model_path ./data/open_roformer --output_path ./data/open_roformer/roformer-frozen_org.onnx
+
+# Simplify onnx model
+onnxsim ./data/open_roformer/roformer-frozen_org.onnx ./data/open_roformer/roformer-frozen.onnx
+python3 deploy.py --model_path ./data/open_roformer/roformer-frozen.onnx --output_path ./data/open_roformer/roformer-frozen.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./data/open_roformer/roformer-frozen
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_roformer_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the ByteMLPerf website, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to the corresponding ByteMLPerf document. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link ByteMLPerf and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Comment out line 102 in compile_backend_iluvatar.py
+sed -i '102s/build_engine/# build_engine/' ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+
+# Move open_roformer
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+mv ./data/open_roformer ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+
+# Setup open_cail2019 dataset
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_cail2019.tar
+tar xf open_cail2019.tar
+cp ./open_cail2019/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019
+rm -f open_cail2019.tar
+
+# Go to general_perf/
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+# Modify model_zoo/roformer-tf-fp32.json
+sed -i 's/segment:0/segment0/g; s/token:0/token0/g' model_zoo/roformer-tf-fp32.json
+# Run Acc scripts
+python3 core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | ACC |
+| -------- | --------- | --------- | ------- | ------- |
+| RoFormer | 2 | FP16 | 195.186 | 0.33789 |
diff --git a/models/nlp/language_model/roformer/ixrt/deploy.py b/models/nlp/language_model/roformer/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..073fb7333577624be7c304eaeb1916d272cb4dcc
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/deploy.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import argparse
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ model = onnx.load(args.model_path)
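+    # Strip ':' from graph input names (TensorFlow tensor names such as
+    # "input_segment:0" become "input_segment0") and update every node that
+    # references them, matching the shapes used in the inference scripts.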
+ for input in model.graph.input:
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] =name.replace(':',"")
+        input.name = input.name.replace(':', "")
+    # Save the modified model
+    onnx.save(model, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/export_onnx.py b/models/nlp/language_model/roformer/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..475dddd7c2ab27b6ca342be98ea92d2c791ff60b
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/export_onnx.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tf2onnx
+from tf2onnx import tf_loader
+import argparse
+ONNX_OPSET = 11
+
+def _convert_graphdef_to_onnx(graph_def,
+ inputs=None,
+ outputs=None,
+ output_path='',
+ **kwargs):
+
+ inputs_as_nchw = kwargs.get('inputs_as_nchw', None)
+ custom_ops = kwargs.get('custom_ops', None)
+ custom_op_handlers = kwargs.get('custom_op_handlers', None)
+ custom_rewriter = kwargs.get('custom_rewriter', None)
+ extra_opset = kwargs.get('extra_opset', None)
+ large_model = kwargs.get('large_model', False)
+ name = kwargs.get('name', 'habana_convert')
+ target = kwargs.get('target', None)
+ shape_override = kwargs.get('shape_override', {})
+
+ tf2onnx.convert.from_graph_def(graph_def,
+ name=name,
+ input_names=inputs,
+ output_names=outputs,
+ opset=ONNX_OPSET,
+ custom_ops=custom_ops,
+ custom_op_handlers=custom_op_handlers,
+ custom_rewriter=custom_rewriter,
+ inputs_as_nchw=inputs_as_nchw,
+ extra_opset=extra_opset,
+ shape_override=shape_override,
+ target=target,
+ large_model=large_model,
+ output_path=output_path)
+ return output_path
+
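+# Load a TensorFlow SavedModel, resolve its input/output tensor names, and
+# convert the frozen graph_def to ONNX (opset 11) via tf2onnx.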
+def savedmodel_to_onnx(model_path, output_path='', **kwargs):
+ inputs = kwargs.get('inputs', None)
+ outputs = kwargs.get('outputs', None)
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ model_path, inputs, outputs)
+ return _convert_graphdef_to_onnx(graph_def, inputs, outputs, output_path, **kwargs)
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ savedmodel_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/perf_engine.py b/models/nlp/language_model/roformer/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f108474b281bfce71ccaf73d60ba3119cf97c1
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh b/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0510e32d98c50d995b584fac3241b804eca512c6
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ SHAPE="input_segment0:${BS}x1024,input_token0:${BS}x1024"
+ MAX_SHAPE="input_segment0:64x1024,input_token0:64x1024"
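+  # The min/opt profiles follow the requested batch size; the max profile allows batches up to 64 at sequence length 1024.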
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --model_type roformer
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin \
+ --min_shape $SHAPE --opt_shape $SHAPE --max_shape $MAX_SHAPE --shapes $SHAPE
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes ${SHAPE}
+
+}
+run 2
\ No newline at end of file
diff --git a/models/nlp/language_model/videobert/ixrt/README.md b/models/nlp/language_model/videobert/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d485fbe39f024ea7036fa987876307bfff02b2f5
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/README.md
@@ -0,0 +1,84 @@
+# VideoBERT
+
+## Description
+
+VideoBERT is a model designed for video understanding tasks, extending the capabilities of BERT (Bidirectional Encoder Representations from Transformers) to video data. It enhances video representation learning by integrating both visual and textual information into a unified framework.
+
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the cifar-100-python dataset.
+
+Or you can run:
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/videobert/ixrt
+cd ${MODEL_PATH}
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./general_perf/model_zoo/popular/open_videobert/video-bert
+export OPTIMIER_FILE=./ixrt-oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_videobert_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document:
+
+Note: You need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# copy data
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/
+cp -r ./datasets/open_cifar/cifar-100-python/ ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_videobert/
+cp ./general_perf/model_zoo/popular/open_videobert/video-bert.onnx ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_videobert/
+
+# run acc scripts
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Top-1 ACC |
+| --------- | --------- | --------- | ----- | --------- |
+| VideoBERT | 4 | FP16 | 37.68 | 61.67 |
diff --git a/models/nlp/language_model/videobert/ixrt/perf_engine.py b/models/nlp/language_model/videobert/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        ByteMLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+        The compile backend can apply optimizations here, such as converting the model format
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7911aecdb775bcec206c398b81eff18e083597f0
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+  BS=${1:-16}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape image:${BS}x3x224x224,text:100x77 \
+ --opt_shape image:${BS}x3x224x224,text:100x77 \
+ --max_shape image:${BS}x3x224x224,text:100x77 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77
+
+}
+run 16
\ No newline at end of file
diff --git a/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c57f758d35547a14106d1acbedb2510fba335c44
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_cifar/
+
+wget -O general_perf/download/open_videobert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_videobert.tar
+tar xf general_perf/download/open_videobert.tar -C general_perf/model_zoo/popular/
+
+
+# # Download Datasets
+wget -O general_perf/download/cifar-100-python.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/cifar-100-python.tar
+tar xf general_perf/download/cifar-100-python.tar -C datasets/open_cifar
+
+
+echo "Extract Done."
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/README.md b/models/recommendation/widedeep/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..350a6da35f97da01cb2c82932c93dac64360f3cc
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/README.md
@@ -0,0 +1,84 @@
+# Wide&Deep
+
+## Description
+
+Generalized linear models with nonlinear feature transformations are widely used for large-scale regression and classification problems with sparse inputs. Memorization of feature interactions through a wide set of cross-product feature transformations are effective and interpretable, while generalization requires more feature engineering effort. With less feature engineering, deep neural networks can generalize better to unseen feature combinations through low-dimensional dense embeddings learned for the sparse features. However, deep neural networks with embeddings can over-generalize and recommend less relevant items when the user-item interactions are sparse and high-rank. In this paper, we present Wide & Deep learning---jointly trained wide linear models and deep neural networks---to combine the benefits of memorization and generalization for recommender systems. We productionized and evaluated the system on Google Play, a commercial mobile app store with over one billion active users and over one million apps. Online experiment results show that Wide & Deep significantly increased app acquisitions compared with wide-only and deep-only models. We have also open-sourced our implementation in TensorFlow.
+
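+The wide and deep branches can be pictured with a small sketch. This is an illustrative, hypothetical example only: it assumes PyTorch is available, the class and layer sizes below are made up, and the actual artifact in this directory is a TensorFlow SavedModel exported to ONNX. The 26 categorical and 13 numeric inputs mirror the Criteo features used later in this README.
+
+```python
+import torch
+import torch.nn as nn
+
+class WideAndDeepSketch(nn.Module):
+    """Toy wide & deep model: a linear 'wide' path for memorization plus an
+    embedding-based 'deep' MLP for generalization, summed before the sigmoid."""
+    def __init__(self, vocab_size=1000, n_sparse=26, n_dense=13, embed_dim=16):
+        super().__init__()
+        self.wide = nn.Linear(vocab_size, 1)                   # wide: sparse / cross-product features
+        self.embeddings = nn.Embedding(vocab_size, embed_dim)  # shared vocabulary for simplicity
+        self.deep = nn.Sequential(                              # deep: embeddings + numeric features
+            nn.Linear(n_sparse * embed_dim + n_dense, 256), nn.ReLU(),
+            nn.Linear(256, 128), nn.ReLU(),
+            nn.Linear(128, 1),
+        )
+
+    def forward(self, wide_x, sparse_ids, dense_x):
+        deep_x = torch.cat([self.embeddings(sparse_ids).flatten(1), dense_x], dim=1)
+        return torch.sigmoid(self.wide(wide_x) + self.deep(deep_x))
+
+model = WideAndDeepSketch()
+ctr = model(torch.rand(4, 1000), torch.randint(0, 1000, (4, 26)), torch.rand(4, 13))
+print(ctr.shape)  # torch.Size([4, 1])
+```
+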
+## Setup
+
+### Install
+
+```bash
+pip3 install tf2onnx
+pip3 install pycuda
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/recommendation/widedeep/ixrt
+cd ${MODEL_PATH}
+
+# export onnx
+python3 export_onnx.py --model_path open_wide_deep_saved_model --output_path open_wide_deep_saved_model/widedeep.onnx
+
+# Simplify onnx model
+onnxsim open_wide_deep_saved_model/widedeep.onnx open_wide_deep_saved_model/widedeep_sim.onnx
+python3 deploy.py --model_path open_wide_deep_saved_model/widedeep_sim.onnx --output_path open_wide_deep_saved_model/widedeep_sim.onnx
+python3 change2dynamic.py --model_path open_wide_deep_saved_model/widedeep_sim.onnx --output_path open_wide_deep_saved_model/widedeep_sim.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./open_wide_deep_saved_model/widedeep_sim
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_widedeep_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the website: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document:
+
+Note: You need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install ByteMLPerf requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Get eval.csv and onnx
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/
+
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/eval.csv
+mv eval.csv ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/
+
+wget http://files.deepspark.org.cn:880/deepspark/widedeep_dynamicshape_new.onnx
+mv widedeep_dynamicshape_new.onnx ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/
+
+# Run Acc scripts
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | ACC |
+| --------- | --------- | --------- | -------- | ------- |
+| Wide&Deep | 1024 | FP16 | 77073.93 | 0.74597 |
diff --git a/models/recommendation/widedeep/ixrt/change2dynamic.py b/models/recommendation/widedeep/ixrt/change2dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9bcf6f156bcd1bfb6e9a7e150c0eb4461e70f98
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/change2dynamic.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import onnx
+
+def change_input_output_dim(model):
+ # Use some symbolic name not used for any other dimension
+ sym_batch_dim = "batch"
+ # sym_batch_dim = -1
+
+ # The following code changes the first dimension of every input to be batch-dim
+ # Modify as appropriate ... note that this requires all inputs to
+ # have the same batch_dim
+ inputs = model.graph.input
+ for input in inputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with first dim.
+ # Add checks as needed.
+ dim1 = input.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ dim1.dim_param = sym_batch_dim
+
+ if input.name == "new_categorical_placeholder:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(2)
+ elif input.name == "new_numeric_placeholder:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(13)
+ elif input.name == "import/head/predictions/zeros_like:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(1)
+
+ # or update it to be an actual value:
+ # dim1.dim_value = actual_batch_dim
+
+ outputs = model.graph.output
+
+ for output in outputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with first dim.
+ # Add checks as needed.
+ dim1 = output.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ dim1.dim_param = sym_batch_dim
+
+def change_input_node_name(model, input_names):
+ for i,input in enumerate(model.graph.input):
+ input_name = input_names[i]
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] = input_name
+ input.name = input_name
+
+
+def change_output_node_name(model, output_names):
+ for i,output in enumerate(model.graph.output):
+ output_name = output_names[i]
+ for node in model.graph.node:
+ for i, name in enumerate(node.output):
+ if name == output.name:
+ node.output[i] = output_name
+ output.name = output_name
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+
+if __name__ == "__main__":
+ args = get_args()
+ model = onnx.load(args.model_path)
+ change_input_output_dim(model)
+ model = onnx.load(args.model_path)
+ for input in model.graph.input:
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] =name.replace(':',"")
+ input.name=input.name.replace(':',"")# 保存修改后的模型
+ onnx.save(model, args.output_path)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/deploy.py b/models/recommendation/widedeep/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1ac694f9a096b4aa6cb0b2acbbc689e5d901db
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/deploy.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseGemmPass(BasePass):
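+    """Fuse a MatMul followed by an Add of a bias into a single Gemm node; the bias
+    becomes the Gemm's third input. Only applied when the MatMul weight is a constant
+    2-D initializer."""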
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ self.transform.find_sequence_subgraph(
+ pattern=[OP.MATMUL, OP.ADD], callback=self.fuse_gemm, strict=True
+ )
+ return graph
+
+ def fuse_gemm(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ add = pattern.nodes[1]
+
+ if len(add.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+ bias_var = None
+ for input in add.operator.inputs:
+ if input not in matmul.operator.outputs:
+ bias_var = input
+
+ matmul.operator.inputs.append(bias_var)
+ self.transform.delete_operator_and_link(
+ add.operator, link_input=matmul.operator.outputs[0]
+ )
+
+ matmul.operator.op_type = OP.GEMM
+ matmul.operator.attributes = attr.GemmAttr(transB=1)
+
+def replace_input(graph):
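+    """Delete the operators between Shape__8 and the zeros_like node and expose
+    "import/head/predictions/zeros_like:0" as a graph input instead."""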
+ transformer = GraphTransform(graph)
+ from_op = graph.get_operator("Shape__8")
+ to_op = graph.get_operator('import/head/predictions/zeros_like')
+ var = graph.get_variable("import/head/predictions/zeros_like:0")
+ transformer.delete_operators_between_op_op(from_op=from_op, to_op=to_op)
+ transformer.add_input("import/head/predictions/zeros_like:0")
+ return graph
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ graph = create_source(args.model_path)()
+ graph = FuseGemmPass().process(graph)
+ graph = replace_input(graph)
+ create_target(saved_path=args.output_path).export(graph)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/export_onnx.py b/models/recommendation/widedeep/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..475dddd7c2ab27b6ca342be98ea92d2c791ff60b
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/export_onnx.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tf2onnx
+from tf2onnx import tf_loader
+import argparse
+ONNX_OPSET = 11
+
+def _convert_graphdef_to_onnx(graph_def,
+ inputs=None,
+ outputs=None,
+ output_path='',
+ **kwargs):
+
+ inputs_as_nchw = kwargs.get('inputs_as_nchw', None)
+ custom_ops = kwargs.get('custom_ops', None)
+ custom_op_handlers = kwargs.get('custom_op_handlers', None)
+ custom_rewriter = kwargs.get('custom_rewriter', None)
+ extra_opset = kwargs.get('extra_opset', None)
+ large_model = kwargs.get('large_model', False)
+ name = kwargs.get('name', 'habana_convert')
+ target = kwargs.get('target', None)
+ shape_override = kwargs.get('shape_override', {})
+
+ tf2onnx.convert.from_graph_def(graph_def,
+ name=name,
+ input_names=inputs,
+ output_names=outputs,
+ opset=ONNX_OPSET,
+ custom_ops=custom_ops,
+ custom_op_handlers=custom_op_handlers,
+ custom_rewriter=custom_rewriter,
+ inputs_as_nchw=inputs_as_nchw,
+ extra_opset=extra_opset,
+ shape_override=shape_override,
+ target=target,
+ large_model=large_model,
+ output_path=output_path)
+ return output_path
+
+def savedmodel_to_onnx(model_path, output_path='', **kwargs):
+ inputs = kwargs.get('inputs', None)
+ outputs = kwargs.get('outputs', None)
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ model_path, inputs, outputs)
+ return _convert_graphdef_to_onnx(graph_def, inputs, outputs, output_path, **kwargs)
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ savedmodel_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh b/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..866adb44937ac5c616b856e13122073ea5cb4233
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
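+  # The Criteo inputs pack 26 categorical features per sample into the leading dim (hence 26*BS) and 13 numeric features as BSx13.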
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --input_shapes "new_categorical_placeholder0:$((26 * ${BS}))x2,new_numeric_placeholder0:${BS}x13,import/head/predictions/zeros_like0:${BS}x1"
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE}
+
+}
+run 1
\ No newline at end of file
diff --git a/models/speech/speech_recognition/conformer/ixrt/README.md b/models/speech/speech_recognition/conformer/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2ad0e26a13c8c6b2400e78726f8b0bab713cea45
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/README.md
@@ -0,0 +1,61 @@
+# Conformer
+
+## Description
+
+Conformer is a speech recognition model proposed by Google in 2020. It combines the advantages of CNN and Transformer. CNN efficiently extracts local features, while Transformer is more effective in capturing long sequence dependencies. Conformer applies convolution to the Encoder layer of Transformer, enhancing the performance of Transformer in the ASR (Automatic Speech Recognition) domain.
+
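+For orientation only, the block structure described above can be sketched in PyTorch as below. This is a simplified, hypothetical example: the deployed engine is built from fused IxRT plugins by convert2onnx.py in this directory, and the real checkpoint additionally uses relative positional attention and sequence masks, which are omitted here.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ConformerBlockSketch(nn.Module):
+    """Macaron FFN -> self-attention -> convolution module -> FFN -> LayerNorm."""
+    def __init__(self, d_model=256, n_head=4, ff_hidden=2048, conv_kernel=15):
+        super().__init__()
+        def ffn():
+            return nn.Sequential(nn.LayerNorm(d_model),
+                                 nn.Linear(d_model, ff_hidden), nn.SiLU(),
+                                 nn.Linear(ff_hidden, d_model))
+        self.ff1, self.ff2 = ffn(), ffn()
+        self.norm_mha = nn.LayerNorm(d_model)
+        self.mha = nn.MultiheadAttention(d_model, n_head, batch_first=True)
+        self.norm_conv = nn.LayerNorm(d_model)
+        self.pointwise1 = nn.Conv1d(d_model, 2 * d_model, kernel_size=1)
+        self.depthwise = nn.Conv1d(d_model, d_model, conv_kernel,
+                                   padding=conv_kernel // 2, groups=d_model)
+        self.bn = nn.BatchNorm1d(d_model)
+        self.pointwise2 = nn.Conv1d(d_model, d_model, kernel_size=1)
+        self.norm_out = nn.LayerNorm(d_model)
+
+    def forward(self, x):                          # x: (batch, time, d_model)
+        x = x + 0.5 * self.ff1(x)                  # macaron feed-forward, half-step residual
+        h = self.norm_mha(x)
+        x = x + self.mha(h, h, h, need_weights=False)[0]
+        c = self.norm_conv(x).transpose(1, 2)      # (batch, d_model, time) for Conv1d
+        c = F.glu(self.pointwise1(c), dim=1)       # gated pointwise expansion
+        c = self.pointwise2(F.silu(self.bn(self.depthwise(c))))
+        x = x + c.transpose(1, 2)
+        x = x + 0.5 * self.ff2(x)
+        return self.norm_out(x)
+
+print(ConformerBlockSketch()(torch.randn(2, 100, 256)).shape)  # torch.Size([2, 100, 256])
+```
+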
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install typeguard==2.13.3
+pip3 install onnxsim
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the Aishell dataset.
+
+Download the model and put it in the conformer_checkpoints directory.
+
+```bash
+ln -s /home/deepspark/datasets/INFER/conformer/20210601_u2++_conformer_exp_aishell ./conformer_checkpoints
+```
+
+### Prepare Data
+
+```bash
+# Accuracy
+DATA_DIR=/PATH/to/data_aishell
+TOOL_DIR="$(pwd)/tools"
+bash scripts/aishell_data_prepare.sh ${DATA_DIR} ${TOOL_DIR}
+```
+
+## Model Conversion And Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_conformer_fp16_accuracy_ixrt.sh
+# Performance
+bash scripts/infer_conformer_fp16_performance_ixrt.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | CER |
+| --------- | --------- | --------- | ------- | ------ |
+| Conformer | 24 | FP16 | 387.821 | 0.0517 |
diff --git a/models/speech/speech_recognition/conformer/ixrt/build_engine.py b/models/speech/speech_recognition/conformer/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa20ee59f6ecd23d8a8cb9272ece0087ed65ab89
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/build_engine.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Build Engine From FusionPlugin Onnx.
+"""
+
+import os
+import ctypes
+import json
+import onnx
+import logging
+import argparse
+
+import tensorrt
+import tensorrt as trt
+from tensorrt import Dims
+
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so")
+ if not os.path.exists(dynamic_path):
+ raise FileNotFoundError(
+ f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!"
+ )
+ ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+ trt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
+
+load_ixrt_plugin()
+
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="build tensorrt engine of conformer.", usage="")
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ required=True,
+ help="conformer",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ required=True,
+ help="onnx_path path to save",
+ )
+ parser.add_argument(
+ "--engine_path",
+ type=str,
+ required=True,
+ help="engine path to save",
+ )
+ parser.add_argument(
+ "--max_batch_size",
+ type=int,
+ required=True,
+ )
+ parser.add_argument(
+ "--max_seq_len",
+ type=int,
+ required=True,
+ )
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+MaxBSZ = args.max_batch_size
+MaxSeqLen = args.max_seq_len
+
+
+def build_engine_trtapi_dynamicshape(args):
+ onnx_model = args.onnx_path
+    assert os.path.isfile(onnx_model), f"The onnx model {onnx_model} must exist!"
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
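+    # Register (min, opt, max) shapes for each dynamic input: the batch dim is fixed at MaxBSZ,
+    # while the time axis ranges from 100 to 1500 feature frames (25 to 374 after subsampling).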
+ profile.set_shape("input", Dims([MaxBSZ, 100, 80]), Dims([MaxBSZ, 1000, 80]), Dims([MaxBSZ, 1500, 80]))
+ profile.set_shape("mask", Dims([MaxBSZ, 1, 25]), Dims([MaxBSZ, 1, 250]), Dims([MaxBSZ, 1, 374]))
+ profile.set_shape("pos_emb", Dims([1, 25, 256]), Dims([1, 250, 256]), Dims([1, 374, 256]))
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(onnx_model)
+ build_config.set_flag(tensorrt.BuilderFlag.FP16)
+
+ # set dynamic
+ # input
+ input_tensor = network.get_input(0)
+ input_tensor.shape = Dims([MaxBSZ, -1, 80])
+ # mask
+ mask_tensor = network.get_input(1)
+ mask_tensor.shape = Dims([MaxBSZ, 1, -1])
+ # pos_emb
+ pos_emb_tensor = network.get_input(2)
+ pos_emb_tensor.shape = Dims([1, -1, 256])
+
+ plan = builder.build_serialized_network(network, build_config)
+ with open(args.engine_path, "wb") as f:
+ f.write(plan)
+
+ print("Build dynamic shape engine done!")
+
+
+def build_engine_trtapi_staticshape(args):
+ onnx_model = args.onnx_path
+    assert os.path.isfile(onnx_model), f"The onnx model {onnx_model} must exist!"
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+
+ parser.parse_from_file(onnx_model)
+ build_config.set_flag(tensorrt.BuilderFlag.FP16)
+
+ plan = builder.build_serialized_network(network, build_config)
+ with open(args.engine_path, "wb") as f:
+ f.write(plan)
+
+ print("Build static shape engine done!")
+
+
+if __name__ == "__main__":
+ build_engine_trtapi_dynamicshape(args)
+ # build_engine_trtapi_staticshape(args)
diff --git a/models/speech/speech_recognition/conformer/ixrt/common.py b/models/speech/speech_recognition/conformer/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..89023300ddc7ca3e4f0f992f4b124d8a8c131ae5
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/common.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import ctypes
+import cv2
+import glob
+import torch
+import tensorrt
+import tensorrt as trt
+import numpy as np
+import pycuda.driver as cuda
+
+from tensorrt.hook.utils import copy_ixrt_io_tensors_as_np
+
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so")
+ if not os.path.exists(dynamic_path):
+ raise FileNotFoundError(
+ f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!"
+ )
+ ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+ trt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
+load_ixrt_plugin()
+
+
+def trtapi(engine_file):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    with open(engine_file, "rb") as f, tensorrt.Runtime(logger) as runtime:
+        assert runtime
+        engine = runtime.deserialize_cuda_engine(f.read())
+        assert engine
+        context = engine.create_execution_context()
+        assert context
+
+    return engine, context
+
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+
+
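+# Same as get_io_bindings, but binding shapes are read from the execution context so that
+# resolved dynamic shapes are used when allocating device buffers.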
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
diff --git a/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py b/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..823ae3215f58d18a636e868668199ed3f388ee20
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py
@@ -0,0 +1,529 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Build Compute Graph(Fusion Plugin Onnx) From Checkpoints.
+"""
+
+import os
+import json
+import torch
+import argparse
+import numpy as np
+from collections import OrderedDict
+
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+from tensorrt.deploy.ir.data_type import DataType
+from tensorrt.deploy.ir.variable import Variable, VariableOptions
+from tensorrt.deploy.ir.graph import Graph
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Build Compute Graph From Checkpoints.", usage=""
+ )
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ required=True,
+ help="conformer",
+ )
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ required=True,
+ help="checkpont of conformer",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ required=True,
+ help="raw onnx path to save",
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ required=True,
+ help="the batch size for test.",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def add_global_cmvn_op(graph, state_dict, args):
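+    """Global CMVN: normalize features as (x - mean) * istd, then unsqueeze a channel
+    dimension for the following Conv2d subsampling stack."""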
+ t = graph
+
+ sub_inputs = [t.make_variable("input", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ key = "encoder.global_cmvn.mean"
+ sub_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ sub_outputs = [t.make_variable("Sub_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ t.make_operator(
+ "Sub",
+ inputs=sub_inputs,
+ outputs=sub_outputs,
+ )
+
+ mul_inputs = sub_outputs
+ key = "encoder.global_cmvn.istd"
+ mul_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ mul_outputs = [t.make_variable("Mul_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ t.make_operator(
+ "Mul",
+ inputs=mul_inputs,
+ outputs=mul_outputs,
+ )
+
+ unsqueeze_inputs = mul_outputs
+ unsqueeze_inputs.append(t.make_variable("axes", value=np.array([1], dtype=np.int64)))
+ unsqueeze_outputs = [t.make_variable("Unsqueeze_output_0", dtype=DataType.FLOAT, shape=(128, 1, 1500, 80))]
+ t.make_operator(
+ "Unsqueeze",
+ inputs=unsqueeze_inputs,
+ outputs=unsqueeze_outputs,
+ )
+
+
+def add_first_submodule_op(graph, state_dict, args):
+ """
+    The first submodule contains the following parts:
+    1. Conv2d + ReLU;
+    2. Conv2d + ReLU;
+    3. Transpose + Reshape;
+    4. MatMul + Add + Mul;
+ """
+
+ t = graph
+ conv2d0_weight_keys = [
+ "encoder.embed.conv.0.weight",
+ "encoder.embed.conv.0.bias",
+ ]
+ conv2d0_attributes = {
+ "dilations": [1, 1],
+ "group": 1,
+ "kernel_shape": [3, 3],
+ "pads": [0, 0, 0, 0],
+ "strides": [2, 2],
+ }
+ conv2d0_inputs = [t.get_variable("Unsqueeze_output_0")]
+ conv2d0_outputs = [t.make_variable("Conv_output_0", dtype=DataType.FLOAT)]
+
+ for key in conv2d0_weight_keys:
+ conv2d0_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ t.make_operator(
+ "Conv",
+ inputs=conv2d0_inputs,
+ outputs=conv2d0_outputs,
+ **conv2d0_attributes
+ )
+
+ relu0_inputs = conv2d0_outputs
+ relu0_outputs = [t.make_variable("Relu_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Relu",
+ inputs=relu0_inputs,
+ outputs=relu0_outputs
+ )
+
+ conv2d1_weight_keys = [
+ "encoder.embed.conv.2.weight",
+ "encoder.embed.conv.2.bias",
+ ]
+ conv2d1_attributes = {
+ "dilations": [1, 1],
+ "group": 1,
+ "kernel_shape": [3, 3],
+ "pads": [0, 0, 0, 0],
+ "strides": [2, 2],
+ }
+ conv2d1_inputs = relu0_outputs
+ conv2d1_outputs = [t.make_variable("Conv_output_1", dtype=DataType.FLOAT)]
+
+ for key in conv2d1_weight_keys:
+ conv2d1_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ t.make_operator(
+ "Conv",
+ inputs=conv2d1_inputs,
+ outputs=conv2d1_outputs,
+ **conv2d1_attributes
+ )
+
+ relu1_inputs = conv2d1_outputs
+ relu1_outputs = [t.make_variable("Relu_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Relu",
+ inputs=relu1_inputs,
+ outputs=relu1_outputs
+ )
+
+ tran_inputs = relu1_outputs
+ tran_outputs = [t.make_variable("Transpose_output_0", dtype=DataType.FLOAT)]
+ tran_attributes = {"perm": [0, 2, 1, 3]}
+ t.make_operator(
+ "Transpose",
+ inputs=tran_inputs,
+ outputs=tran_outputs,
+ **tran_attributes
+ )
+
+ reshape_inputs = tran_outputs
+ reshape_inputs.append(t.make_variable(name="constant_0", value=np.array([args.batch_size, -1, 4864]), dtype=DataType.INT64))
+ reshape_outputs = [t.make_variable("Reshape_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Reshape",
+ inputs=reshape_inputs,
+ outputs=reshape_outputs,
+ )
+
+ matmul_inputs = reshape_outputs
+ matmul_inputs.append(t.make_variable(name="embed.out.0.weight", value=state_dict["encoder.embed.out.0.weight"].transpose(1, 0))) # (256,4864)--->(4864,256)
+ matmul_outputs = [t.make_variable("MatMul_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "MatMul",
+ inputs=matmul_inputs,
+ outputs=matmul_outputs,
+ )
+
+ add_inputs = matmul_outputs
+ add_inputs.append(t.make_variable(name="embed.out.0.bias", value=state_dict["encoder.embed.out.0.bias"]))
+ add_outputs = [t.make_variable("Add_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Add",
+ inputs=add_inputs,
+ outputs=add_outputs,
+ )
+
+ mul_inputs = add_outputs
+ mul_inputs.append(t.make_variable(name="constant_1", value=np.array([16.], dtype=np.float32), dtype=DataType.FLOAT))
+ mul_outputs = [t.make_variable("Mul_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Mul",
+ inputs=mul_inputs,
+ outputs=mul_outputs,
+ )
+
+
+def add_encoder_ff_macaron_op(graph, state_dict, args, index):
+
+ t = graph
+ ff_macaron_keys = [
+ "encoder.encoders.{}.norm_ff_macaron.weight",
+ "encoder.encoders.{}.norm_ff_macaron.bias",
+ "encoder.encoders.{}.feed_forward_macaron.w_1.weight",
+ "encoder.encoders.{}.feed_forward_macaron.w_1.bias",
+ "encoder.encoders.{}.feed_forward_macaron.w_2.weight",
+ "encoder.encoders.{}.feed_forward_macaron.w_2.bias",
+ ]
+
+ attributes = {
+ "in_feature": 256,
+ "hidden_size": 2048,
+ "act_type": 12,
+ "ff_scale": 0.5,
+ }
+
+ if index == 0:
+ inputs = [graph.get_variable("Mul_output_1")]
+ else:
+ inputs = [graph.get_variable("norm_final_{}_output".format(index-1))]
+
+ outputs = [t.make_variable("ff_macaron_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in ff_macaron_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "PositionWiseFFNPluginDynamic_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_mhsa_op(graph, state_dict, args, index):
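+    """Multi-head self-attention with relative positional weights (linear_pos, pos_bias_u/v),
+    emitted as a single ConformerMultiHeadSelfAttentionPlugin_IxRT op."""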
+
+ t = graph
+ mhsa_keys = [
+ "encoder.encoders.{}.norm_mha.weight",
+ "encoder.encoders.{}.norm_mha.bias",
+ "encoder.encoders.{}.self_attn.linear_q.weight",
+ "encoder.encoders.{}.self_attn.linear_q.bias",
+ "encoder.encoders.{}.self_attn.linear_k.weight",
+ "encoder.encoders.{}.self_attn.linear_k.bias",
+ "encoder.encoders.{}.self_attn.linear_v.weight",
+ "encoder.encoders.{}.self_attn.linear_v.bias",
+ "encoder.encoders.{}.self_attn.linear_pos.weight",
+ "encoder.encoders.{}.self_attn.pos_bias_u",
+ "encoder.encoders.{}.self_attn.pos_bias_v",
+ "encoder.encoders.{}.self_attn.linear_out.weight",
+ "encoder.encoders.{}.self_attn.linear_out.bias",
+ ]
+
+ attributes = {
+ "bs": 128,
+ "seq_len": 374,
+ "n_head": 4,
+ "n_feat": 256,
+ }
+
+ if index == 0:
+ inputs = [
+ graph.get_variable("ff_macaron_{}_output".format(index)),
+ t.make_variable("mask", dtype=DataType.INT32, shape=(128, 1, 374)),
+ t.make_variable("pos_emb", dtype=DataType.FLOAT, shape=(1, 374, 256)),
+ ]
+ else:
+ inputs = [
+ graph.get_variable("ff_macaron_{}_output".format(index)),
+ graph.get_variable("mask"),
+ graph.get_variable("pos_emb"),
+ ]
+
+ outputs = [t.make_variable("mhsa_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in mhsa_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "ConformerMultiHeadSelfAttentionPlugin_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_conv_module_op(graph, state_dict, args, index):
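+    """Conformer convolution module (pointwise conv -> depthwise conv -> norm -> pointwise conv),
+    emitted as a single ConformerConvModulePlugin_IxRT op."""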
+
+ t = graph
+ conv_module_keys = [
+ "encoder.encoders.{}.norm_conv.weight",
+ "encoder.encoders.{}.norm_conv.bias",
+ "encoder.encoders.{}.conv_module.pointwise_conv1.weight",
+ "encoder.encoders.{}.conv_module.pointwise_conv1.bias",
+ "encoder.encoders.{}.conv_module.depthwise_conv.weight",
+ "encoder.encoders.{}.conv_module.depthwise_conv.bias",
+ "encoder.encoders.{}.conv_module.norm.weight",
+ "encoder.encoders.{}.conv_module.norm.bias",
+ "encoder.encoders.{}.conv_module.pointwise_conv2.weight",
+ "encoder.encoders.{}.conv_module.pointwise_conv2.bias",
+ ]
+
+ attributes = {
+ "kernel_size_1": 1,
+ "stride_1": 1,
+ "odim_1": 512,
+ "kernel_size_2": 8,
+ "stride_2": 1,
+ "odim_2": 256,
+ "kernel_size_3": 1,
+ "stride_3": 1,
+ "odim_3": 256,
+ }
+
+ inputs = [
+ graph.get_variable("mhsa_{}_output".format(index)),
+ graph.get_variable("mask"),
+ ]
+ outputs = [t.make_variable("conv_module_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in conv_module_keys:
+ key = key.format(index)
+
+ if "conv_module.depthwise_conv.weight" in key:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].permute(1, 2, 0).half(), dtype=DataType.FLOAT16))
+ elif "bias" in key and "norm" not in key:
+ inputs.append(t.make_variable(name=key, value=state_dict[key], dtype=DataType.FLOAT))
+ else:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "ConformerConvModulePlugin_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_positionwise_ff_op(graph, state_dict, args, index):
+
+ t = graph
+ positionwise_ff_keys = [
+ "encoder.encoders.{}.norm_ff.weight",
+ "encoder.encoders.{}.norm_ff.bias",
+ "encoder.encoders.{}.feed_forward.w_1.weight",
+ "encoder.encoders.{}.feed_forward.w_1.bias",
+ "encoder.encoders.{}.feed_forward.w_2.weight",
+ "encoder.encoders.{}.feed_forward.w_2.bias",
+ ]
+
+ attributes = {
+ "in_feature": 256,
+ "hidden_size": 2048,
+ "act_type": 12,
+ "ff_scale": 0.5,
+ }
+
+ inputs = [graph.get_variable('conv_module_{}_output'.format(index))]
+ outputs = [t.make_variable("positionwise_ff_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in positionwise_ff_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "PositionWiseFFNPluginDynamic_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_ln_op(graph, state_dict, args, index):
+
+ t = graph
+ ln_keys = [
+ "encoder.encoders.{}.norm_final.weight",
+ "encoder.encoders.{}.norm_final.bias",
+ ]
+
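+    # Standard ONNX LayerNormalization attributes; the epsilon below is simply 1e-5
+    # rounded to float32 precision.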
+ attributes = {
+ "axis": -1,
+ "epsilon": 0.000009999999747378752,
+ "stash_type": 1,
+ }
+
+ inputs = [graph.get_variable("positionwise_ff_{}_output".format(index))]
+ outputs = [t.make_variable("norm_final_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in ln_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "LayerNormalization",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_final_ln_op(graph, state_dict, args):
+
+ t = graph
+ ln_keys = [
+ "encoder.after_norm.weight",
+ "encoder.after_norm.bias",
+ ]
+
+ attributes = {
+ "axis": -1,
+ "epsilon": 0.000009999999747378752,
+ "stash_type": 1,
+ }
+
+ inputs = [graph.get_variable("norm_final_11_output")]
+ outputs = [t.make_variable("norm_final_output", dtype=DataType.FLOAT)]
+
+ for key in ln_keys:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "LayerNormalization",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_ctc_op(graph, state_dict, args):
+ t = graph
+ # matmul
+ matmul_inputs = [graph.get_variable("norm_final_output")]
+ matmul_inputs.append(t.make_variable(name="ctc.ctc_lo.weight", value=state_dict["ctc.ctc_lo.weight"].transpose(1, 0))) # (4233,256)--->(256,4233)
+ matmul_outputs = [t.make_variable("MatMul_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "MatMul",
+ inputs=matmul_inputs,
+ outputs=matmul_outputs,
+ )
+
+ add_inputs = matmul_outputs
+ add_inputs.append(t.make_variable(name="ctc.ctc_lo.bias", value=state_dict["ctc.ctc_lo.bias"]))
+ add_outputs = [t.make_variable("Add_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Add",
+ inputs=add_inputs,
+ outputs=add_outputs,
+ )
+
+ logsoftmax_inputs = add_outputs
+ logsoftmax_outputs = [t.make_variable("output", dtype=DataType.FLOAT)]
+ attributes = {
+ "axis": 2
+ }
+ t.make_operator(
+ "LogSoftmax",
+ inputs=logsoftmax_inputs,
+ outputs=logsoftmax_outputs,
+ **attributes
+ )
+
+
+def main(args):
+ graph = Graph()
+ transform = GraphTransform(graph)
+ state_dict = torch.load(args.model_path)
+
+ # 0. Global CMVN: sub+mul+unsqueeze
+ add_global_cmvn_op(transform, state_dict, args)
+
+ # 1. First Submodule: Conv2d+Relu+Transpose+MatMul
+ add_first_submodule_op(transform, state_dict, args)
+
+ # 2. Second Submodule: ConformerEncoderLayer: 12 layers
+ for i in range(args.num_layers):
+ add_encoder_ff_macaron_op(transform, state_dict, args, i)
+ add_encoder_mhsa_op(transform, state_dict, args, i)
+ add_encoder_conv_module_op(transform, state_dict, args, i)
+ add_encoder_positionwise_ff_op(transform, state_dict, args, i)
+ add_encoder_ln_op(transform, state_dict, args, i)
+
+ # 3. Third Submodule: FinalNorm
+ add_final_ln_op(transform, state_dict, args)
+
+    # 4. Fourth Submodule: CTC + LogSoftmax
+ add_ctc_op(transform, state_dict, args)
+
+ # 5. set input and output
+ graph.add_input(graph.get_variable("input"))
+ graph.add_input(graph.get_variable("mask"))
+ graph.add_input(graph.get_variable("pos_emb"))
+ graph.add_output(graph.get_variable("output"))
+    # 6. export onnx file
+ create_target(saved_path=args.onnx_path).export(graph)
+ print("save onnx: ", args.onnx_path)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ model_name = args.model_name.lower()
+ args.num_layers = 12
+ args.hidden_size = 2048
+ args.head_num = 4
+ args.head_dim = 64
+ args.pad_id = 0
+ args.inner_size = 3072
+ main(args)
diff --git a/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..35aad9bbf24533bed27e98ddbe4e326fa897df88
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py
@@ -0,0 +1,285 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+import argparse
+import yaml
+import copy
+import torch
+import numpy as np
+
+from tqdm.contrib import tqdm
+from torch.utils.data import DataLoader
+from wenet.file_utils import read_symbol_table
+from wenet.dataset import Dataset
+from tools.compute_cer import Calculator, characterize, normalize, default_cluster
+import tensorrt
+from tensorrt import Dims
+from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings
+import pickle
+
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from utils import make_pad_mask, RelPositionalEncoding
+from postprocess import ctc_greedy_search
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description="recognize with your model")
+ parser.add_argument(
+ "--infer_type",
+ default="fp16",
+ choices=["fp16", "int8"],
+ help="inference type: fp16 or int8",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--batch_size", type=int, default=24)
+ parser.add_argument("--data_dir", required=True, help="test data directory")
+ parser.add_argument(
+ "--model_dir", type=str, required=True, help="model for inference"
+ )
+ args = parser.parse_args()
+ return args
+
+
+def tensorrt_infer(engine, context, all_inputs):
+
+ input_names = ["input", "mask", "pos_emb"]
+ output_names = ["output"]
+
+ for input_name, input_data in zip(input_names, all_inputs):
+ input_idx = engine.get_binding_index(input_name)
+ input_shape = input_data.shape
+ context.set_binding_shape(input_idx, Dims(input_shape))
+
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+ pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+ for i, input_data in enumerate(all_inputs):
+ cuda.memcpy_htod(inputs[i]["allocation"], input_data)
+
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"])
+ return pred_output
+
+
+def engine_init(engine):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    engine, context = create_engine_context(engine, logger)
+
+    return engine, context
+
+
+def calculate_cer(data, reference_data):
+ calculator = Calculator()
+ tochar = True
+ split = None
+ case_sensitive = False
+ ignore_words = set()
+ rec_set = {}
+ for line in data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
+
+ default_clusters = {}
+ default_words = {}
+ for line in reference_data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+ result = calculator.calculate(lab, rec)
+
+ result = calculator.overall()
+ cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"]
+ corr = result["cor"] / result["all"]
+
+ return cer, corr
+
+
+def main():
+ args = get_args()
+
+    # Load the config file
+ config_fn = os.path.join(args.model_dir, "config.yaml")
+ with open(config_fn, "r") as fin:
+ configs = yaml.load(fin, Loader=yaml.FullLoader)
+
+ dataset_conf = copy.deepcopy(configs["dataset_conf"])
+ dataset_conf["filter_conf"]["max_length"] = 102400
+ dataset_conf["filter_conf"]["min_length"] = 0
+ dataset_conf["filter_conf"]["token_max_length"] = 102400
+ dataset_conf["filter_conf"]["token_min_length"] = 0
+ dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400
+ dataset_conf["filter_conf"]["min_output_input_ratio"] = 0
+ dataset_conf["speed_perturb"] = False
+ dataset_conf["spec_aug"] = False
+ dataset_conf["shuffle"] = False
+ dataset_conf["sort"] = True
+ dataset_conf["fbank_conf"]["dither"] = 0.0
+ dataset_conf["batch_conf"]["batch_type"] = "static"
+ dataset_conf["batch_conf"]["batch_size"] = args.batch_size
+
+ # Load dict
+ dict_fn = os.path.join(args.model_dir, "words.txt")
+ char_dict = {}
+ with open(dict_fn, "r", encoding="utf8") as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ char_dict[int(arr[1])] = arr[0]
+ eos = len(char_dict) - 1
+
+ data_type = "raw"
+ test_data_fn = os.path.join(args.data_dir, "data.list")
+ symbol_table = read_symbol_table(dict_fn)
+ test_dataset = Dataset(
+ data_type, test_data_fn, symbol_table, dataset_conf, partition=False
+ )
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+ data_path_pkl = os.path.join(args.data_dir, f"aishell_test_data_bs{args.batch_size}.pkl")
+
+ print("*** 1. Prepare data ***")
+ if not os.path.isfile(data_path_pkl):
+ eval_samples = []
+ max_batch_size = -1
+ max_feature_length = -1
+ for batch in test_data_loader:
+ keys, feats, target, feats_lengths, target_lengths = batch
+ max_feature_length = max(max_feature_length, feats.size(1))
+ max_batch_size = max(max_batch_size, feats.size(0))
+ eval_samples.append(
+ [
+ keys,
+ feats.cpu().numpy().astype(np.float16),
+ feats_lengths.cpu().numpy().astype(np.int32),
+ ]
+ )
+ with open(data_path_pkl, "wb") as f:
+ pickle.dump(
+ [
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ],
+ f,
+ )
+ else:
+ print(f"load data from tmp: {data_path_pkl}")
+ with open(data_path_pkl, "rb") as f:
+ (
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ) = pickle.load(f)
+ print(
+ f"dataset max shape: batch_size: {max_batch_size}, feat_length: {max_feature_length}"
+ )
+
+ print("*** 2. Load engine ***")
+ engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine")
+ engine, context = engine_init(engine_path)
+
+ print("*** 3. Warm up ***")
+ if args.warm_up > 0:
+ for i in range(args.warm_up):
+ feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32)
+ feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500
+ mask_tmp = make_pad_mask(feats_lengths_tmp, 1500)
+ mask_len_tmp = mask_tmp.shape[-1]
+ pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy()
+ all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp]
+ tensorrt_infer(engine, context, all_inputs)
+
+ results = []
+ for keys, feats, feats_lengths in tqdm(eval_samples):
+ b, seq_len, feat = feats.shape
+
+ inputs = feats.astype(np.float32)
+ mask = make_pad_mask(feats_lengths, seq_len)
+ mask_len = mask.shape[-1]
+ pos_emb = rel_positional_encoding(mask_len).numpy()
+
+ all_inputs = [inputs, mask, pos_emb]
+ hyps = tensorrt_infer(
+ engine,
+ context,
+ all_inputs
+ )
+
+ ctc_probs = torch.from_numpy(hyps)
+ ctc_lens = torch.from_numpy(feats_lengths)
+ hyps = ctc_greedy_search(ctc_probs, ctc_lens)
+
+ for i, key in enumerate(keys):
+ line = f"{key} "
+ for w in hyps[i]:
+ w = w - 1
+ if w == eos:
+ break
+ line += char_dict[w]
+ results.append(line)
+
+    # Compute CER against the reference transcripts
+ reference_file = os.path.join(args.data_dir, "text")
+ reference_data = []
+ for line in open(reference_file, "r", encoding="utf-8"):
+ reference_data.append(line)
+
+ cer, corr = calculate_cer(results, reference_data)
+ target_cer = float(os.environ["Accuracy"])
+ print("CER: ", cer, "target CER: ", target_cer)
+ if cer <= target_cer:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19233fa6813722083e1e86fbfc310dcd1370670
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+import time
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+import argparse
+import yaml
+import copy
+import torch
+import numpy as np
+
+from tqdm.contrib import tqdm
+from torch.utils.data import DataLoader
+from wenet.file_utils import read_symbol_table
+from wenet.dataset import Dataset
+from tools.compute_cer import Calculator, characterize, normalize, default_cluster
+import tensorrt
+from tensorrt import Dims
+from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings
+import pickle
+
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from utils import make_pad_mask, RelPositionalEncoding
+from postprocess import ctc_greedy_search
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description="recognize with your model")
+ parser.add_argument(
+ "--infer_type",
+ default="fp16",
+ choices=["fp16", "int8"],
+ help="inference type: fp16 or int8",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--batch_size", type=int, default=24)
+ parser.add_argument("--data_dir", required=True, help="test data directory")
+ parser.add_argument(
+ "--model_dir", type=str, required=True, help="model for inference"
+ )
+ args = parser.parse_args()
+ return args
+
+
+def tensorrt_infer(engine, context, all_inputs):
+
+ input_names = ["input", "mask", "pos_emb"]
+ output_names = ["output"]
+
+ for input_name, input_data in zip(input_names, all_inputs):
+ input_idx = engine.get_binding_index(input_name)
+ input_shape = input_data.shape
+ context.set_binding_shape(input_idx, Dims(input_shape))
+
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+ pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+ for i, input_data in enumerate(all_inputs):
+ cuda.memcpy_htod(inputs[i]["allocation"], input_data)
+
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"])
+ return pred_output
+
+
+def engine_init(engine):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    engine, context = create_engine_context(engine, logger)
+
+    return engine, context
+
+
+def calculate_cer(data, reference_data):
+ calculator = Calculator()
+ tochar = True
+ split = None
+ case_sensitive = False
+ ignore_words = set()
+ rec_set = {}
+ for line in data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
+
+ default_clusters = {}
+ default_words = {}
+ for line in reference_data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+ result = calculator.calculate(lab, rec)
+
+ result = calculator.overall()
+ cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"]
+ corr = result["cor"] / result["all"]
+
+ return cer, corr
+
+
+def main():
+ args = get_args()
+
+    # Load the config file
+ config_fn = os.path.join(args.model_dir, "config.yaml")
+ with open(config_fn, "r") as fin:
+ configs = yaml.load(fin, Loader=yaml.FullLoader)
+
+ dataset_conf = copy.deepcopy(configs["dataset_conf"])
+ dataset_conf["filter_conf"]["max_length"] = 102400
+ dataset_conf["filter_conf"]["min_length"] = 0
+ dataset_conf["filter_conf"]["token_max_length"] = 102400
+ dataset_conf["filter_conf"]["token_min_length"] = 0
+ dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400
+ dataset_conf["filter_conf"]["min_output_input_ratio"] = 0
+ dataset_conf["speed_perturb"] = False
+ dataset_conf["spec_aug"] = False
+ dataset_conf["shuffle"] = False
+ dataset_conf["sort"] = True
+ dataset_conf["fbank_conf"]["dither"] = 0.0
+ dataset_conf["batch_conf"]["batch_type"] = "static"
+ dataset_conf["batch_conf"]["batch_size"] = args.batch_size
+
+ # Load dict
+ dict_fn = os.path.join(args.model_dir, "words.txt")
+ char_dict = {}
+ with open(dict_fn, "r", encoding="utf8") as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ char_dict[int(arr[1])] = arr[0]
+ eos = len(char_dict) - 1
+
+ data_type = "raw"
+ test_data_fn = os.path.join(args.data_dir, "data.list")
+ symbol_table = read_symbol_table(dict_fn)
+ test_dataset = Dataset(
+ data_type, test_data_fn, symbol_table, dataset_conf, partition=False
+ )
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+ data_path_pkl = os.path.join(args.data_dir, f"aishell_test_data_bs{args.batch_size}.pkl")
+
+ print("*** 1. Prepare data ***")
+ if not os.path.isfile(data_path_pkl):
+ eval_samples = []
+ max_batch_size = -1
+ max_feature_length = -1
+ for batch in test_data_loader:
+ keys, feats, target, feats_lengths, target_lengths = batch
+ max_feature_length = max(max_feature_length, feats.size(1))
+ max_batch_size = max(max_batch_size, feats.size(0))
+ eval_samples.append(
+ [
+ keys,
+ feats.cpu().numpy().astype(np.float16),
+ feats_lengths.cpu().numpy().astype(np.int32),
+ ]
+ )
+ with open(data_path_pkl, "wb") as f:
+ pickle.dump(
+ [
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ],
+ f,
+ )
+ else:
+ print(f"load data from tmp: {data_path_pkl}")
+ with open(data_path_pkl, "rb") as f:
+ (
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ) = pickle.load(f)
+ print(
+ f"dataset max shape: batch_size: {max_batch_size}, feat_length: {max_feature_length}"
+ )
+
+ print("*** 2. Load engine ***")
+ engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine")
+ engine, context = engine_init(engine_path)
+
+ print("*** 3. Warm up ***")
+ if args.warm_up > 0:
+ for i in range(args.warm_up):
+ feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32)
+ feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500
+ mask_tmp = make_pad_mask(feats_lengths_tmp, 1500)
+ mask_len_tmp = mask_tmp.shape[-1]
+ pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy()
+ all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp]
+ tensorrt_infer(engine, context, all_inputs)
+
+ print("*** 4. Inference ***")
+ start_time = time.time()
+ num_samples = 0
+ results = []
+ for keys, feats, feats_lengths in tqdm(eval_samples):
+ b, seq_len, feat = feats.shape
+ num_samples += b
+ inputs = feats.astype(np.float32)
+ mask = make_pad_mask(feats_lengths, seq_len)
+ mask_len = mask.shape[-1]
+ pos_emb = rel_positional_encoding(mask_len).numpy()
+
+ all_inputs = [inputs, mask, pos_emb]
+ hyps = tensorrt_infer(
+ engine,
+ context,
+ all_inputs
+ )
+
+ eval_time = time.time() - start_time
+
+ QPS = num_samples / eval_time
+ print(f"Recognize {num_samples} sentences, {QPS} sentences/s")
+ target_qps = float(os.environ["Accuracy"])
+ print("QPS: = ", QPS, "target QPS: ", target_qps)
+ if QPS >= target_qps:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py b/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33f8b0465aee011298fa9933086fbdc1c8dbd4d4
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py
@@ -0,0 +1 @@
+from .search import ctc_greedy_search
diff --git a/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py b/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2ae55650539b9d0be352e78a64999606ac12fbb
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py
@@ -0,0 +1,103 @@
+import math
+from collections import defaultdict
+from typing import List, Dict
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+
+def remove_duplicates_and_blank(hyp: List[int],
+ blank_id: int = 0) -> List[int]:
+ new_hyp: List[int] = []
+ cur = 0
+ while cur < len(hyp):
+ if hyp[cur] != blank_id:
+ new_hyp.append(hyp[cur])
+ prev = cur
+ while cur < len(hyp) and hyp[cur] == hyp[prev]:
+ cur += 1
+ return new_hyp
+
+
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+ """Make mask tensor containing indices of padded part.
+
+ See description of make_non_pad_mask.
+
+ Args:
+ lengths (torch.Tensor): Batch of lengths (B,).
+ Returns:
+ torch.Tensor: Mask tensor containing indices of padded part.
+
+ Examples:
+ >>> lengths = [5, 3, 2]
+ >>> make_pad_mask(lengths)
+ masks = [[0, 0, 0, 0 ,0],
+ [0, 0, 0, 1, 1],
+ [0, 0, 1, 1, 1]]
+ """
+ batch_size = lengths.size(0)
+ max_len = max_len if max_len > 0 else lengths.max().item()
+ seq_range = torch.arange(0,
+ max_len,
+ dtype=torch.int64,
+ device=lengths.device)
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
+ seq_length_expand = lengths.unsqueeze(-1)
+ mask = seq_range_expand >= seq_length_expand
+
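+    # Subsample the mask twice with stride 2 so it follows the encoder's ~4x conv
+    # subsampling of the time axis (assumed to mirror the Conformer frontend).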
+ mask = mask[:, 2::2][:, 2::2]
+ return mask
+
+
+class DecodeResult:
+
+ def __init__(self,
+ tokens: List[int],
+ score: float = 0.0,
+ confidence: float = 0.0,
+ tokens_confidence: List[float] = None,
+ times: List[int] = None,
+ nbest: List[List[int]] = None,
+ nbest_scores: List[float] = None,
+ nbest_times: List[List[int]] = None):
+ """
+ Args:
+ tokens: decode token list
+ score: the total decode score of this result
+            confidence: the total confidence of this result, in the range [0, 1]
+ tokens_confidence: confidence of each token
+ times: timestamp of each token, list of (start, end)
+ nbest: nbest result
+ nbest_scores: score of each nbest
+ nbest_times:
+ """
+ self.tokens = tokens
+ self.score = score
+ self.confidence = confidence
+ self.tokens_confidence = tokens_confidence
+ self.times = times
+ self.nbest = nbest
+ self.nbest_scores = nbest_scores
+ self.nbest_times = nbest_times
+
+
+def ctc_greedy_search(ctc_probs: torch.Tensor,
+                      ctc_lens: torch.Tensor,
+                      blank_id: int = 0) -> List[List[int]]:
+
+ batch_size = ctc_probs.shape[0]
+ maxlen = ctc_probs.size(1)
+ topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1)
+ topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen)
+
+ mask_ctc_lens = ctc_lens[0].item()
+ mask = make_pad_mask(ctc_lens, mask_ctc_lens) # (B, maxlen)
+ topk_index = topk_index.masked_fill_(mask, blank_id) # (B, maxlen)
+ hyps = [hyp.tolist() for hyp in topk_index]
+ scores = topk_prob.max(1)
+ results = []
+ for hyp in hyps:
+ results.append(remove_duplicates_and_blank(hyp, blank_id))
+ return results
+
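+# Usage sketch (assumed shapes): given log-probs `ctc_probs` of shape (B, T, V) from the
+# engine and the corresponding lengths `ctc_lens`, e.g.
+#   hyps = ctc_greedy_search(torch.from_numpy(probs), torch.from_numpy(lengths))
+# each entry of `hyps` is a token-id sequence with blanks and repeats removed.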
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh
new file mode 100755
index 0000000000000000000000000000000000000000..985564c2294b2a413531d6ced018029ec911fb23
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+# set -euox pipefail
+
+data_dir=$1
+tool_dir=$2
+
+wav_dir=${data_dir}/wav
+aishell_text=${data_dir}/transcript/aishell_transcript_v0.8.txt
+
+# data directory check
+if [ ! -d $wav_dir ] || [ ! -f $aishell_text ]; then
+ echo "Error: wav directory and aishell text not found!"
+ exit 1;
+fi
+
+# find test wav file
+local_dir=${data_dir}/local
+mkdir -p $local_dir
+find $wav_dir -iname "*.wav" > $local_dir/wav.flist || exit 1;
+
+# Transcriptions preparation
+sed -e 's/\.wav//' $local_dir/wav.flist | awk -F '/' '{print $NF}' > $local_dir/utt.list
+paste -d' ' $local_dir/utt.list $local_dir/wav.flist > $local_dir/wav.scp_all
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $aishell_text > $local_dir/transcripts.txt
+awk '{print $1}' $local_dir/transcripts.txt > $local_dir/utt.list
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $local_dir/wav.scp_all | sort -u > $local_dir/wav.scp
+sort -u $local_dir/transcripts.txt > $local_dir/text
+echo "Preparing transcriptions succeeded!"
+
+test_dir=${data_dir}/test
+mkdir -p ${test_dir}
+for f in wav.scp text; do
+ cp $local_dir/$f ${test_dir}/$f || exit 1;
+done
+rm -r ${data_dir}/local
+
+# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
+# while `shard` is used for large datasets (over 1k hours) and is faster for
+# reading data and training.
+data_type=raw
+num_utts_per_shard=1000
+
+# remove the space between the text labels for Mandarin dataset
+cp $test_dir/text $test_dir/text.org
+paste -d " " <(cut -f 1 -d" " ${test_dir}/text.org) \
+ <(cut -f 2- -d" " ${test_dir}/text.org | tr -d " ") \
+ > ${test_dir}/text
+rm ${test_dir}/text.org
+
+# Prepare required format
+if [ $data_type == "shard" ]; then
+ ${tool_dir}/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
+ --num_threads 16 $test_dir/wav.scp $test_dir/text \
+ $(realpath $test_dir/shards) $test_dir/data.list
+else
+ ${tool_dir}/make_raw_list.py $test_dir/wav.scp $test_dir/text \
+ $test_dir/data.list
+fi
+
+echo "AISHELL data preparation succeeded!"
\ No newline at end of file
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1af4bb4e03a0c9c6084ae7a122f66f765c27c86
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh
@@ -0,0 +1,49 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+
+PROJECT_DIR=${current_path}/..
+DATA_DIR=${current_path}/../aishell_test_data/test
+MODEL_DIR=${current_path}/../conformer_checkpoints
+
+export Accuracy=${Accuracy:=0.052}
+
+cd ${PROJECT_DIR}
+
+echo "Step1.Export Onnx From Checkpoints!"
+python3 convert2onnx.py \
+ --model_name "Conformer" \
+ --model_path=${MODEL_DIR}/final.pt \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --batch_size=8
+
+echo "Step2.Build Engine!"
+python3 build_engine.py \
+ --model_name "Conformer" \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
+ --max_batch_size=8 \
+ --max_seq_len=1500
+
+echo "Step3.Inference(Test ACC)!"
+python3 ixrt_inference_accuracy.py \
+ --infer_type fp16 \
+ --warm_up 3 \
+ --batch_size ${BATCH_SIZE:=8} \
+ --data_dir ${DATA_DIR} \
+ --model_dir ${MODEL_DIR}
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dc02673c03fb21a4301b757a18885af81cbad31d
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ echo "fails"
+ EXIT_STATUS=1
+ fi
+}
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+
+PROJECT_DIR=${current_path}/..
+DATA_DIR=${current_path}/../aishell_test_data/test
+MODEL_DIR=${current_path}/../conformer_checkpoints
+
+export Accuracy=${Accuracy:=350}
+
+cd ${PROJECT_DIR}
+
+
+echo "Step1.Export Onnx From Checkpoints!"
+python3 convert2onnx.py \
+ --model_name "Conformer" \
+ --model_path=${MODEL_DIR}/final.pt \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --batch_size=24
+
+echo "Step2.Build Engine!"
+python3 build_engine.py \
+ --model_name "Conformer" \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
+ --max_batch_size=24 \
+ --max_seq_len=1500
+
+echo "Step3.Inference(Test QPS)!"
+python3 ixrt_inference_performance.py \
+ --infer_type fp16 \
+ --batch_size ${BATCH_SIZE:=24} \
+ --data_dir ${DATA_DIR} \
+ --model_dir ${MODEL_DIR}
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/__init__.py b/models/speech/speech_recognition/conformer/ixrt/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py b/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py
new file mode 100755
index 0000000000000000000000000000000000000000..a5db08979f4d31a4a2ac9e4ceb0d122537690aac
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py
@@ -0,0 +1,532 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+import sys
+import unicodedata
+import codecs
+
+remove_tag = True
+spacelist = [' ', '\t', '\r', '\n']
+puncts = ['!', ',', '?',
+ '、', '。', '!', ',', ';', '?',
+ ':', '「', '」', '︰', '『', '』', '《', '》']
+
+def characterize(string) :
+ res = []
+ i = 0
+ while i < len(string):
+ char = string[i]
+ if char in puncts:
+ i += 1
+ continue
+ cat1 = unicodedata.category(char)
+ # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
+ i += 1
+ continue
+ if cat1 == 'Lo': # letter-other
+ res.append(char)
+ i += 1
+ else:
+            # some input looks like a tag such as <unk> glued to the next word; we want to separate it into two words.
+ sep = ' '
+ if char == '<':
+ sep = '>'
+ j = i + 1
+ while j < len(string):
+ c = string[j]
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
+ break
+ j += 1
+ if j < len(string) and string[j] == '>':
+ j += 1
+ res.append(string[i:j])
+ i = j
+ return res
+
+def stripoff_tags(x):
+ if not x:
+ return ''
+ chars = []
+ i = 0
+ T = len(x)
+ while i < T:
+ if x[i] == '<':
+ while i < T and x[i] != '>':
+ i += 1
+ i += 1
+ else:
+ chars.append(x[i])
+ i += 1
+ return ''.join(chars)
+
+
+def normalize(sentence, ignore_words, cs, split=None):
+ """ sentence, ignore_words are both in unicode
+ """
+ new_sentence = []
+ for token in sentence:
+ x = token
+ if not cs:
+ x = x.upper()
+ if x in ignore_words:
+ continue
+ if remove_tag:
+ x = stripoff_tags(x)
+ if not x:
+ continue
+ if split and x in split:
+ new_sentence += split[x]
+ if x.isalnum():
+ for k in x:
+ new_sentence.append(k)
+ else:
+ new_sentence.append(x)
+ return new_sentence
+
+class Calculator :
+ def __init__(self) :
+ self.data = {}
+ self.space = []
+ self.cost = {}
+ self.cost['cor'] = 0
+ self.cost['sub'] = 1
+ self.cost['del'] = 1
+ self.cost['ins'] = 1
+
+ def calculate(self, lab, rec) :
+ # Initialization
+ lab.insert(0, '')
+ rec.insert(0, '')
+ while len(self.space) < len(lab) :
+ self.space.append([])
+ for row in self.space :
+ for element in row :
+ element['dist'] = 0
+ element['error'] = 'non'
+ while len(row) < len(rec) :
+ row.append({'dist' : 0, 'error' : 'non'})
+ for i in range(len(lab)) :
+ self.space[i][0]['dist'] = i
+ self.space[i][0]['error'] = 'del'
+ for j in range(len(rec)) :
+ self.space[0][j]['dist'] = j
+ self.space[0][j]['error'] = 'ins'
+ self.space[0][0]['error'] = 'non'
+ for token in lab :
+ if token not in self.data and len(token) > 0 :
+ self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0,
+ 'ins' : 0, 'del' : 0}
+ for token in rec :
+ if token not in self.data and len(token) > 0 :
+ self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0,
+ 'ins' : 0, 'del' : 0}
+ # Computing edit distance
+ for i, lab_token in enumerate(lab) :
+ for j, rec_token in enumerate(rec) :
+ if i == 0 or j == 0 :
+ continue
+ min_dist = sys.maxsize
+ min_error = 'none'
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
+ error = 'del'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
+ error = 'ins'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ if lab_token == rec_token :
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
+ error = 'cor'
+ else :
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
+ error = 'sub'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ self.space[i][j]['dist'] = min_dist
+ self.space[i][j]['error'] = min_error
+ # Tracing back
+ result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0,
+ 'ins': 0, 'del': 0}
+ i = len(lab) - 1
+ j = len(rec) - 1
+ while True :
+ if self.space[i][j]['error'] == 'cor' : # correct
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
+ result['all'] = result['all'] + 1
+ result['cor'] = result['cor'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'sub' : # substitution
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
+ result['all'] = result['all'] + 1
+ result['sub'] = result['sub'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'del' : # deletion
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
+ result['all'] = result['all'] + 1
+ result['del'] = result['del'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, "")
+ i = i - 1
+ elif self.space[i][j]['error'] == 'ins' : # insertion
+ if len(rec[j]) > 0 :
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
+ result['ins'] = result['ins'] + 1
+ result['lab'].insert(0, "")
+ result['rec'].insert(0, rec[j])
+ j = j - 1
+ elif self.space[i][j]['error'] == 'non' : # starting point
+ break
+ else : # shouldn't reach here
+ print('this should not happen , i={i} , j={j} , \
+ error={error}'.
+ format(i=i, j=j, error=self.space[i][j]['error']))
+ return result
+
+ def overall(self) :
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in self.data :
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def cluster(self, data) :
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in data :
+ if token in self.data :
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def keys(self) :
+ return list(self.data.keys())
+
+def width(string):
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
+
+def default_cluster(word) :
+ unicode_names = [unicodedata.name(char) for char in word]
+ for i in reversed(range(len(unicode_names))) :
+ if unicode_names[i].startswith('DIGIT') : # 1
+ unicode_names[i] = 'Number' # 'DIGIT'
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
+ unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) :
+ # 明 / 郎
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
+ unicode_names[i].startswith('LATIN SMALL LETTER')) :
+ # A / a
+ unicode_names[i] = 'English' # 'LATIN LETTER'
+ elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
+ elif (unicode_names[i].startswith('AMPERSAND') or
+ unicode_names[i].startswith('APOSTROPHE') or
+ unicode_names[i].startswith('COMMERCIAL AT') or
+ unicode_names[i].startswith('DEGREE CELSIUS') or
+ unicode_names[i].startswith('EQUALS SIGN') or
+ unicode_names[i].startswith('FULL STOP') or
+ unicode_names[i].startswith('HYPHEN-MINUS') or
+ unicode_names[i].startswith('LOW LINE') or
+ unicode_names[i].startswith('NUMBER SIGN') or
+ unicode_names[i].startswith('PLUS SIGN') or
+ unicode_names[i].startswith('SEMICOLON')) :
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
+ del unicode_names[i]
+ else :
+ return 'Other'
+ if len(unicode_names) == 0 :
+ return 'Other'
+ if len(unicode_names) == 1 :
+ return unicode_names[0]
+ for i in range(len(unicode_names) - 1) :
+ if unicode_names[i] != unicode_names[i + 1] :
+ return 'Other'
+ return unicode_names[0]
+
+def usage() :
+ print("compute-wer.py : compute word error rate (WER) \
+ and align recognition results and references.")
+ print(" usage : python compute-wer.py [--cs={0,1}] \
+ [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \
+ [--padding-symbol={space,underline}] test.ref test.hyp > test.wer")
+
+if __name__ == '__main__':
+ if len(sys.argv) == 1 :
+ usage()
+ sys.exit(0)
+ calculator = Calculator()
+ cluster_file = ''
+ ignore_words = set()
+ tochar = False
+ verbose = 1
+ padding_symbol = ' '
+ case_sensitive = False
+ max_words_per_line = sys.maxsize
+ split = None
+ while len(sys.argv) > 3:
+ a = '--maxw='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):]
+ del sys.argv[1]
+ max_words_per_line = int(b)
+ continue
+ a = '--rt='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ remove_tag = (b == 'true') or (b != '0')
+ continue
+ a = '--cs='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ case_sensitive = (b == 'true') or (b != '0')
+ continue
+ a = '--cluster='
+ if sys.argv[1].startswith(a):
+ cluster_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ continue
+ a = '--splitfile='
+ if sys.argv[1].startswith(a):
+ split_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ split = dict()
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ words = line.strip().split()
+ if len(words) >= 2:
+ split[words[0]] = words[1:]
+ continue
+ a = '--ig='
+ if sys.argv[1].startswith(a):
+ ignore_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ line = line.strip()
+ if len(line) > 0:
+ ignore_words.add(line)
+ continue
+ a = '--char='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ tochar = (b == 'true') or (b != '0')
+ continue
+ a = '--v='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ verbose = 0
+ try:
+ verbose = int(b)
+ except Exception:
+ if b == 'true' or b != '0':
+ verbose = 1
+ continue
+ a = '--padding-symbol='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ if b == 'space':
+ padding_symbol = ' '
+ elif b == 'underline':
+ padding_symbol = '_'
+ continue
+ if True or sys.argv[1].startswith('-'):
+ # ignore invalid switch
+ del sys.argv[1]
+ continue
+
+ if not case_sensitive:
+ ig = set([w.upper() for w in ignore_words])
+ ignore_words = ig
+
+ default_clusters = {}
+ default_words = {}
+
+ ref_file = sys.argv[1]
+ hyp_file = sys.argv[2]
+ rec_set = {}
+ if split and not case_sensitive:
+ newsplit = dict()
+ for w in split:
+ words = split[w]
+ for i in range(len(words)):
+ words[i] = words[i].upper()
+ newsplit[w.upper()] = words
+ split = newsplit
+
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
+ for line in fh:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words,
+ case_sensitive, split)
+
+ # compute error rate on the interaction of reference file and hyp file
+ for line in open(ref_file, 'r', encoding='utf-8') :
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.rstrip('\n').split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+ if verbose:
+ print('\nutt: %s' % fid)
+
+ for word in rec + lab :
+ if word not in default_words :
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters :
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name] :
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+
+ result = calculator.calculate(lab, rec)
+ if verbose:
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('WER: %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ space = {}
+ space['lab'] = []
+ space['rec'] = []
+ for idx in range(len(result['lab'])) :
+ len_lab = width(result['lab'][idx])
+ len_rec = width(result['rec'][idx])
+ length = max(len_lab, len_rec)
+ space['lab'].append(length - len_lab)
+ space['rec'].append(length - len_rec)
+ upper_lab = len(result['lab'])
+ upper_rec = len(result['rec'])
+ lab1, rec1 = 0, 0
+ while lab1 < upper_lab or rec1 < upper_rec:
+ if verbose > 1:
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('lab:', end=' ')
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
+ for idx in range(lab1, lab2):
+ token = result['lab'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['lab'][idx]) :
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print()
+ if verbose > 1:
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('rec:', end=' ')
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
+ for idx in range(rec1, rec2):
+ token = result['rec'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['rec'][idx]) :
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print('\n', end='\n')
+ lab1 = lab2
+ rec1 = rec2
+
+ if verbose:
+ print('==================================================='
+ '========================')
+ print()
+
+ result = calculator.overall()
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('Overall -> wer %4.2f %% Corr %4.2f %%' % (wer, result['cor']*100/result['all']), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ if not verbose:
+ print()
+
+ if verbose:
+ for cluster_id in default_clusters :
+ result = calculator.cluster(k for k in default_clusters[cluster_id])
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ if len(cluster_file) > 0 : # compute separated WERs for word clusters
+ cluster_id = ''
+ cluster = []
+ for line in open(cluster_file, 'r', encoding='utf-8') :
+            for token in line.rstrip('\n').split() :
+                # end of cluster reached, like </Keyword>
+                if token[0:2] == '</' and token[len(token) - 1] == '>' and \
+                    token.lstrip('</').rstrip('>') == cluster_id :
+ result = calculator.cluster(cluster)
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ cluster_id = ''
+ cluster = []
+                # begin of cluster reached, like <Keyword>
+ elif (token[0] == '<' and token[len(token) - 1] == '>' and
+ cluster_id == ''):
+ cluster_id = token.lstrip('<').rstrip('>')
+ cluster = []
+ # general terms, like WEATHER / CAR / ...
+ else :
+ cluster.append(token)
+ print()
+ print('======================================='
+ '====================================')
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl b/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl
new file mode 100755
index 0000000000000000000000000000000000000000..b76d37f41be0886470281978bfacf97f6b8ae976
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl
@@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+# Copyright 2010-2012 Microsoft Corporation
+# Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose "n-th" field is an utterance id), printing
+# out only those lines whose "n-th" field is in id_list. The index of
+# the "n-th" field is 1, by default, but can be changed by using
+# the -f switch
+
+$exclude = 0;
+$field = 1;
+$shifted = 0;
+
+do {
+ $shifted=0;
+ if ($ARGV[0] eq "--exclude") {
+ $exclude = 1;
+ shift @ARGV;
+ $shifted=1;
+ }
+ if ($ARGV[0] eq "-f") {
+ $field = $ARGV[1];
+ shift @ARGV; shift @ARGV;
+ $shifted=1
+ }
+} while ($shifted);
+
+if(@ARGV < 1 || @ARGV > 2) {
+ die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" .
+ "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
+ "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
+ "only the lines that were *not* in id_list.\n" .
+ "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
+ "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
+ "-f option, add 1 to the argument.\n" .
+ "See also: utils/filter_scp.pl .\n";
+}
+
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+ @A = split;
+ @A>=1 || die "Invalid id-list file line $_";
+ $seen{$A[0]} = 1;
+}
+
+if ($field == 1) { # Treat this as special case, since it is common.
+ while(<>) {
+ $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
+ # $1 is what we filter on.
+ if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
+ print $_;
+ }
+ }
+} else {
+ while(<>) {
+ @A = split;
+ @A > 0 || die "Invalid scp file line $_";
+ @A >= $field || die "Invalid scp file line $_";
+ if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
+ print $_;
+ }
+ }
+}
+
+# tests:
+# the following should print "foo 1"
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
+# the following should print "bar 2".
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py b/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py
new file mode 100755
index 0000000000000000000000000000000000000000..2f84f015542bb38da027b8ea61e8638f873cec33
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ parser.add_argument('--segments', default=None, help='segments file')
+ parser.add_argument('wav_file', help='wav file')
+ parser.add_argument('text_file', help='text file')
+ parser.add_argument('output_file', help='output list file')
+ args = parser.parse_args()
+
+ wav_table = {}
+ with open(args.wav_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ wav_table[arr[0]] = arr[1]
+
+ if args.segments is not None:
+ segments_table = {}
+ with open(args.segments, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 4
+ segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
+
+ with open(args.text_file, 'r', encoding='utf8') as fin, \
+ open(args.output_file, 'w', encoding='utf8') as fout:
+ for line in fin:
+ arr = line.strip().split(maxsplit=1)
+ key = arr[0]
+ txt = arr[1] if len(arr) > 1 else ''
+ if args.segments is None:
+ assert key in wav_table
+ wav = wav_table[key]
+ line = dict(key=key, wav=wav, txt=txt)
+ else:
+ assert key in segments_table
+ wav_key, start, end = segments_table[key]
+ wav = wav_table[wav_key]
+ line = dict(key=key, wav=wav, txt=txt, start=start, end=end)
+ json_line = json.dumps(line, ensure_ascii=False)
+ fout.write(json_line + '\n')
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py b/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py
new file mode 100755
index 0000000000000000000000000000000000000000..fcd4bcd7d62ba933cf27c34fc02e18371a6b10a6
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import io
+import logging
+import os
+import tarfile
+import time
+import multiprocessing
+
+import torch
+import torchaudio
+import torchaudio.backend.sox_io_backend as sox
+
+AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
+
+
+def write_tar_file(data_list,
+ no_segments,
+ tar_file,
+ resample=16000,
+ index=0,
+ total=1):
+ logging.info('Processing {} {}/{}'.format(tar_file, index, total))
+ read_time = 0.0
+ save_time = 0.0
+ write_time = 0.0
+ with tarfile.open(tar_file, "w") as tar:
+ prev_wav = None
+ for item in data_list:
+ if no_segments:
+ key, txt, wav = item
+ else:
+ key, txt, wav, start, end = item
+
+ suffix = wav.split('.')[-1]
+ assert suffix in AUDIO_FORMAT_SETS
+ if no_segments:
+ ts = time.time()
+ with open(wav, 'rb') as fin:
+ data = fin.read()
+ read_time += (time.time() - ts)
+ else:
+ if wav != prev_wav:
+ ts = time.time()
+ waveforms, sample_rate = sox.load(wav, normalize=False)
+ read_time += (time.time() - ts)
+ prev_wav = wav
+ start = int(start * sample_rate)
+ end = int(end * sample_rate)
+ audio = waveforms[:1, start:end]
+
+ # resample
+ if sample_rate != resample:
+ audio = torchaudio.transforms.Resample(
+ sample_rate, resample)(audio)
+
+ ts = time.time()
+ f = io.BytesIO()
+ sox.save(f, audio, resample, format="wav", bits_per_sample=16)
+ # Save to wav for segments file
+ suffix = "wav"
+ f.seek(0)
+ data = f.read()
+ save_time += (time.time() - ts)
+
+ assert isinstance(txt, str)
+ ts = time.time()
+ txt_file = key + '.txt'
+ txt = txt.encode('utf8')
+ txt_data = io.BytesIO(txt)
+ txt_info = tarfile.TarInfo(txt_file)
+ txt_info.size = len(txt)
+ tar.addfile(txt_info, txt_data)
+
+ wav_file = key + '.' + suffix
+ wav_data = io.BytesIO(data)
+ wav_info = tarfile.TarInfo(wav_file)
+ wav_info.size = len(data)
+ tar.addfile(wav_info, wav_data)
+ write_time += (time.time() - ts)
+ logging.info('read {} save {} write {}'.format(read_time, save_time,
+ write_time))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ parser.add_argument('--num_utts_per_shard',
+ type=int,
+ default=1000,
+ help='num utts per shard')
+ parser.add_argument('--num_threads',
+ type=int,
+ default=1,
+ help='num threads for make shards')
+ parser.add_argument('--prefix',
+ default='shards',
+ help='prefix of shards tar file')
+ parser.add_argument('--segments', default=None, help='segments file')
+ parser.add_argument('--resample',
+ type=int,
+ default=16000,
+ help='target sample rate for resampling')
+ parser.add_argument('wav_file', help='wav file')
+ parser.add_argument('text_file', help='text file')
+ parser.add_argument('shards_dir', help='output shards dir')
+ parser.add_argument('shards_list', help='output shards list file')
+ args = parser.parse_args()
+ logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s %(levelname)s %(message)s')
+
+ torch.set_num_threads(1)
+ wav_table = {}
+ with open(args.wav_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ wav_table[arr[0]] = arr[1]
+
+ no_segments = True
+ segments_table = {}
+ if args.segments is not None:
+ no_segments = False
+ with open(args.segments, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 4
+ segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
+
+ data = []
+ with open(args.text_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split(maxsplit=1)
+ key = arr[0]
+ txt = arr[1] if len(arr) > 1 else ''
+ if no_segments:
+ assert key in wav_table
+ wav = wav_table[key]
+ data.append((key, txt, wav))
+ else:
+ wav_key, start, end = segments_table[key]
+ wav = wav_table[wav_key]
+ data.append((key, txt, wav, start, end))
+
+ num = args.num_utts_per_shard
+ chunks = [data[i:i + num] for i in range(0, len(data), num)]
+ os.makedirs(args.shards_dir, exist_ok=True)
+
+ # Use a process pool to speed up shard writing
+ pool = multiprocessing.Pool(processes=args.num_threads)
+ shards_list = []
+ tasks_list = []
+ num_chunks = len(chunks)
+ for i, chunk in enumerate(chunks):
+ tar_file = os.path.join(args.shards_dir,
+ '{}_{:09d}.tar'.format(args.prefix, i))
+ shards_list.append(tar_file)
+ pool.apply_async(
+ write_tar_file,
+ (chunk, no_segments, tar_file, args.resample, i, num_chunks))
+
+ pool.close()
+ pool.join()
+
+ with open(args.shards_list, 'w', encoding='utf8') as fout:
+ for name in shards_list:
+ fout.write(name + '\n')
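+
+# Illustrative invocation (paths are assumptions; the positional arguments are
+# the Kaldi-style wav.scp, the text file, the output shard directory and the
+# output shard list):
+#   python3 tools/make_shard_list.py --num_utts_per_shard 1000 --num_threads 8 \
+#       data/test/wav.scp data/test/text data/test/shards data/test/shards.list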
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py b/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py
new file mode 100755
index 0000000000000000000000000000000000000000..4f4dcc901d436650695f0b80e0cf99e1e99269ee
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
+# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import re
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def exist_or_not(i, match_pos):
+ start_pos = None
+ end_pos = None
+ for pos in match_pos:
+ if pos[0] <= i < pos[1]:
+ start_pos = pos[0]
+ end_pos = pos[1]
+ break
+
+ return start_pos, end_pos
+
+def seg_char(sent):
+ pattern = re.compile(r'([\u4e00-\u9fa5])')
+ chars = pattern.split(sent)
+ chars = [w for w in chars if len(w.strip()) > 0]
+ return chars
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='convert raw text to tokenized text',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--nchar',
+ '-n',
+ default=1,
+ type=int,
+ help='number of characters to split, i.e., \
+ aabb -> a a b b with -n 1 and aa bb with -n 2')
+ parser.add_argument('--skip-ncols',
+ '-s',
+ default=0,
+ type=int,
+ help='skip first n columns')
+ parser.add_argument('--space',
+ default='',
+ type=str,
+ help='space symbol')
+ parser.add_argument('--bpe-model',
+ '-m',
+ default=None,
+ type=str,
+ help='bpe model for english part')
+ parser.add_argument('--non-lang-syms',
+ '-l',
+ default=None,
+ type=str,
+ help='list of non-linguistic symbols,'
+ ' e.g., <NOISE> etc.')
+ parser.add_argument('text',
+ type=str,
+ default=False,
+ nargs='?',
+ help='input text')
+ parser.add_argument('--trans_type',
+ '-t',
+ type=str,
+ default="char",
+ choices=["char", "phn", "cn_char_en_bpe"],
+ help="""Transcript type. char/phn. e.g., for TIMIT
+ FADG0_SI1279 -
+ If trans_type is char, read from
+ SI1279.WRD file -> "bricks are an alternative"
+ Else if trans_type is phn,
+ read from SI1279.PHN file ->
+ "sil b r ih sil k s aa r er n aa l
+ sil t er n ih sil t ih v sil" """)
+ return parser
+
+
+def main():
+ parser = get_parser()
+ args = parser.parse_args()
+
+ rs = []
+ if args.non_lang_syms is not None:
+ with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f:
+ nls = [x.rstrip() for x in f.readlines()]
+ rs = [re.compile(re.escape(x)) for x in nls]
+
+ if args.bpe_model is not None:
+ import sentencepiece as spm
+ sp = spm.SentencePieceProcessor()
+ sp.load(args.bpe_model)
+
+ if args.text:
+ f = codecs.open(args.text, encoding="utf-8")
+ else:
+ f = codecs.getreader("utf-8")(
+ sys.stdin if is_python2 else sys.stdin.buffer)
+
+ sys.stdout = codecs.getwriter("utf-8")(
+ sys.stdout if is_python2 else sys.stdout.buffer)
+ line = f.readline()
+ n = args.nchar
+ while line:
+ x = line.split()
+ print(' '.join(x[:args.skip_ncols]), end=" ")
+ a = ' '.join(x[args.skip_ncols:])
+
+ # get all matched positions
+ match_pos = []
+ for r in rs:
+ i = 0
+ while i >= 0:
+ m = r.search(a, i)
+ if m:
+ match_pos.append([m.start(), m.end()])
+ i = m.end()
+ else:
+ break
+
+ if len(match_pos) > 0:
+ chars = []
+ i = 0
+ while i < len(a):
+ start_pos, end_pos = exist_or_not(i, match_pos)
+ if start_pos is not None:
+ chars.append(a[start_pos:end_pos])
+ i = end_pos
+ else:
+ chars.append(a[i])
+ i += 1
+ a = chars
+
+ if (args.trans_type == "phn"):
+ a = a.split(" ")
+ elif args.trans_type == "cn_char_en_bpe":
+ b = seg_char(a)
+ a = []
+ for j in b:
+ # we use "▁" to instead of blanks among english words
+ # warning: here is "▁", not "_"
+ for l in j.strip().split("▁"):
+ if not l.encode('UTF-8').isalpha():
+ a.append(l)
+ else:
+ for k in sp.encode_as_pieces(l):
+ a.append(k)
+ else:
+ a = [a[j:j + n] for j in range(0, len(a), n)]
+
+ a_flat = []
+ for z in a:
+ a_flat.append("".join(z))
+
+ a_chars = [z.replace(' ', args.space) for z in a_flat]
+ if (args.trans_type == "phn"):
+ a_chars = [z.replace("sil", args.space) for z in a_chars]
+ print(' '.join(a_chars))
+ line = f.readline()
+
+
+if __name__ == '__main__':
+ main()
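+
+# Illustrative invocation (an assumption, following common WeNet recipes):
+# skip the first column (the utterance id) and split the rest into single
+# characters, writing the tokenized text to stdout.
+#   python3 tools/text2token.py -s 1 -n 1 data/test/text > data/test/text.char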
diff --git a/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py b/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57435c110fc12f39d79c1b02f4b2e83dfe1a3e3
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py
@@ -0,0 +1,39 @@
+import os
+import torch
+import numpy as np
+
+from .embedding import RelPositionalEncoding
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def make_pad_mask(lengths: np.ndarray, max_len: int = 0) -> np.ndarray:
+ """Make the subsampled encoder attention mask from utterance lengths.
+
+ A pad mask (True at padded positions) is built from ``lengths``, inverted so
+ that valid frames are marked, expanded to shape (B, 1, T), and subsampled
+ twice by a factor of 2 along the time axis to match the encoder's
+ convolutional subsampling by 4.
+
+ Args:
+ lengths (numpy.ndarray): Batch of lengths (B,).
+ max_len (int): Padded length; defaults to lengths.max().
+ Returns:
+ numpy.ndarray: int32 mask of shape (B, 1, T'), 1 for valid frames and
+ 0 for padding, where T' is the subsampled time length.
+ """
+
+ batch_size = lengths.shape[0]
+ max_len = max_len if max_len > 0 else lengths.max().item()
+ seq_range = np.arange(0, max_len, dtype=np.int64)
+ seq_range_expand = np.tile(seq_range, batch_size).reshape(batch_size, max_len)
+ seq_length_expand = lengths[..., None]
+ mask = seq_range_expand >= seq_length_expand
+ mask = np.expand_dims(mask, axis=1)
+ mask = ~mask
+ mask = mask[:, :, 2::2][:, :, 2::2]
+ mask = mask.astype(np.int32)
+ return mask
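+
+# Illustrative shapes (an example, not part of the original utilities): for two
+# utterances of 100 and 80 frames the returned mask has shape (2, 1, 24),
+# since the time axis is subsampled twice by a factor of 2; entries are 1 for
+# valid positions and 0 for padding.
+#   lengths = np.array([100, 80], dtype=np.int64)
+#   mask = make_pad_mask(lengths)   # -> shape (2, 1, 24), dtype int32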
diff --git a/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py b/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd65c4cdfc3fec244c88d2c47cf94b33b9088f3
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py
@@ -0,0 +1,133 @@
+"""Positonal Encoding Module."""
+
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+
+class PositionalEncoding(torch.nn.Module):
+ """Positional encoding.
+
+ :param int d_model: embedding dim
+ :param float dropout_rate: dropout rate
+ :param int max_len: maximum input length
+
+ PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
+ PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+ """
+
+ def __init__(self,
+ d_model: int,
+ dropout_rate: float,
+ max_len: int = 5000,
+ reverse: bool = False):
+ """Construct an PositionalEncoding object."""
+ super().__init__()
+ self.d_model = d_model
+ self.xscale = math.sqrt(self.d_model)
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
+ self.max_len = max_len
+
+ pe = torch.zeros(self.max_len, self.d_model)
+ position = torch.arange(0, self.max_len,
+ dtype=torch.float32).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, self.d_model, 2, dtype=torch.float32) *
+ -(math.log(10000.0) / self.d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0)
+ self.register_buffer("pe", pe)
+
+ def forward(self,
+ x: torch.Tensor,
+ offset: Union[int, torch.Tensor] = 0) \
+ -> Tuple[torch.Tensor, torch.Tensor]:
+ """Add positional encoding.
+
+ Args:
+ x (torch.Tensor): Input. Its shape is (batch, time, ...)
+ offset (int, torch.tensor): position offset
+
+ Returns:
+ torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+ torch.Tensor: for compatibility to RelPositionalEncoding
+ """
+
+ pos_emb = self.position_encoding(offset, x.size(1), False)
+ x = x * self.xscale + pos_emb
+ return self.dropout(x), self.dropout(pos_emb)
+
+ def position_encoding(self,
+ offset: Union[int, torch.Tensor],
+ size: int,
+ apply_dropout: bool = True) -> torch.Tensor:
+ """ For getting encoding in a streaming fashion
+
+ Attention!!!!!
+ In a non-streaming setting dropout is applied only once at the whole
+ utterance level, but in a streaming scenario this function is called
+ several times with increasing input size, so dropout would otherwise
+ be applied several times; hence the `apply_dropout` flag.
+
+ Args:
+ offset (int or torch.tensor): start offset
+ size (int): required size of position encoding
+
+ Returns:
+ torch.Tensor: Corresponding encoding
+ """
+ # How to subscript a Union type:
+ # https://github.com/pytorch/pytorch/issues/69434
+ # import ipdb;ipdb.set_trace()
+ if isinstance(offset, int):
+ assert offset + size <= self.max_len
+ pos_emb = self.pe[:, offset:offset + size]
+ elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar
+ assert offset + size <= self.max_len
+ pos_emb = self.pe[:, offset:offset + size]
+ else: # for batched streaming decoding on GPU
+ assert torch.max(offset) + size <= self.max_len
+ index = offset.unsqueeze(1) + \
+ torch.arange(0, size).to(offset.device) # B X T
+ flag = index > 0
+ # remove negative offset
+ index = index * flag
+ pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model
+
+ if apply_dropout:
+ pos_emb = self.dropout(pos_emb)
+ return pos_emb
+
+
+class RelPositionalEncoding(PositionalEncoding):
+ """Relative positional encoding module.
+ See : Appendix B in https://arxiv.org/abs/1901.02860
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int): Maximum input length.
+ """
+
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+ """Initialize class."""
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+ def forward(self,
+ seq_len: int,
+ offset: Union[int, torch.Tensor] = 0) \
+ -> torch.Tensor:
+ """Compute the relative positional encoding.
+ Args:
+ seq_len (int): Required length of the positional embedding.
+ offset (int or torch.Tensor): Start offset, defaults to 0.
+ Returns:
+ torch.Tensor: Positional embedding tensor (1, seq_len, d_model).
+ """
+ pos_emb = self.position_encoding(offset, seq_len, False)
+ # return self.dropout(pos_emb)
+ return pos_emb
+
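+# Illustrative usage (values are assumptions; utils/__init__.py constructs the
+# module the same way with d_model=256):
+#   pos_enc = RelPositionalEncoding(256, dropout_rate=0.1)
+#   pos_emb = pos_enc(seq_len=100)   # -> shape (1, 100, 256), dropout not applied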
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/__init__.py b/models/speech/speech_recognition/conformer/ixrt/wenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py b/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..88a8cd15aec2277a36358883b25e929b179165e8
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import torch
+import torch.distributed as dist
+from torch.utils.data import IterableDataset
+
+import wenet.processor as processor
+from wenet.file_utils import read_lists
+
+
+class Processor(IterableDataset):
+ def __init__(self, source, f, *args, **kw):
+ assert callable(f)
+ self.source = source
+ self.f = f
+ self.args = args
+ self.kw = kw
+
+ def set_epoch(self, epoch):
+ self.source.set_epoch(epoch)
+
+ def __iter__(self):
+ """ Return an iterator over the source dataset processed by the
+ given processor.
+ """
+ assert self.source is not None
+ assert callable(self.f)
+ return self.f(iter(self.source), *self.args, **self.kw)
+
+ def apply(self, f):
+ assert callable(f)
+ return Processor(self, f, *self.args, **self.kw)
+
+
+class DistributedSampler:
+ def __init__(self, shuffle=True, partition=True):
+ self.epoch = -1
+ self.update()
+ self.shuffle = shuffle
+ self.partition = partition
+
+ def update(self):
+ assert dist.is_available()
+ if dist.is_initialized():
+ self.rank = dist.get_rank()
+ self.world_size = dist.get_world_size()
+ else:
+ self.rank = 0
+ self.world_size = 1
+ worker_info = torch.utils.data.get_worker_info()
+ if worker_info is None:
+ self.worker_id = 0
+ self.num_workers = 1
+ else:
+ self.worker_id = worker_info.id
+ self.num_workers = worker_info.num_workers
+ return dict(rank=self.rank,
+ world_size=self.world_size,
+ worker_id=self.worker_id,
+ num_workers=self.num_workers)
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
+
+ def sample(self, data):
+ """ Sample data according to rank/world_size/num_workers
+
+ Args:
+ data(List): input data list
+
+ Returns:
+ List: data list after sample
+ """
+ data = list(range(len(data)))
+ # TODO(Binbin Zhang): fix this
+ # We can not handle uneven data for CV on DDP, so we don't
+ # sample data by rank, that means every GPU gets the same
+ # and all the CV data
+ if self.partition:
+ if self.shuffle:
+ random.Random(self.epoch).shuffle(data)
+ data = data[self.rank::self.world_size]
+ data = data[self.worker_id::self.num_workers]
+ return data
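+
+ # Illustrative partition (assumed values, shuffle disabled): with
+ # world_size=2, rank=0, num_workers=2, worker_id=1 and 8 items, sample()
+ # reduces the indices [0..7] -> [0, 2, 4, 6] (rank slice) -> [2, 6]
+ # (worker slice).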
+
+
+class DataList(IterableDataset):
+ def __init__(self, lists, shuffle=True, partition=True):
+ self.lists = lists
+ self.sampler = DistributedSampler(shuffle, partition)
+
+ def set_epoch(self, epoch):
+ self.sampler.set_epoch(epoch)
+
+ def __iter__(self):
+ sampler_info = self.sampler.update()
+ indexes = self.sampler.sample(self.lists)
+ for index in indexes:
+ # yield dict(src=src)
+ data = dict(src=self.lists[index])
+ data.update(sampler_info)
+ yield data
+
+
+def Dataset(data_type,
+ data_list_file,
+ symbol_table,
+ conf,
+ bpe_model=None,
+ non_lang_syms=None,
+ partition=True):
+ """ Construct dataset from arguments
+
+ We have two shuffle stages in the Dataset. The first is a global
+ shuffle at the shards tar/raw file level. The second is a local,
+ buffered shuffle at the training-sample level.
+
+ Args:
+ data_type(str): raw/shard
+ bpe_model(str): model for english bpe part
+ partition(bool): whether to do data partition in terms of rank
+ """
+ assert data_type in ['raw', 'shard']
+ lists = read_lists(data_list_file)
+ shuffle = conf.get('shuffle', True)
+ dataset = DataList(lists, shuffle=shuffle, partition=partition)
+ if data_type == 'shard':
+ dataset = Processor(dataset, processor.url_opener)
+ dataset = Processor(dataset, processor.tar_file_and_group)
+ else:
+ dataset = Processor(dataset, processor.parse_raw)
+
+ dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model,
+ non_lang_syms, conf.get('split_with_space', False))
+ filter_conf = conf.get('filter_conf', {})
+ dataset = Processor(dataset, processor.filter, **filter_conf)
+
+ resample_conf = conf.get('resample_conf', {})
+ dataset = Processor(dataset, processor.resample, **resample_conf)
+
+ speed_perturb = conf.get('speed_perturb', False)
+ if speed_perturb:
+ dataset = Processor(dataset, processor.speed_perturb)
+
+ fbank_conf = conf.get('fbank_conf', {})
+ dataset = Processor(dataset, processor.compute_fbank, **fbank_conf)
+
+ spec_aug = conf.get('spec_aug', True)
+ if spec_aug:
+ spec_aug_conf = conf.get('spec_aug_conf', {})
+ dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf)
+
+ if shuffle:
+ shuffle_conf = conf.get('shuffle_conf', {})
+ dataset = Processor(dataset, processor.shuffle, **shuffle_conf)
+
+ sort = conf.get('sort', True)
+ if sort:
+ sort_conf = conf.get('sort_conf', {})
+ dataset = Processor(dataset, processor.sort, **sort_conf)
+
+ batch_conf = conf.get('batch_conf', {})
+ dataset = Processor(dataset, processor.batch, **batch_conf)
+ dataset = Processor(dataset, processor.padding)
+ return dataset
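+
+# Illustrative usage (config values are assumptions, not taken from the
+# original recipe): build a non-shuffled test pipeline over a shard list and
+# iterate padded batches.
+#   from wenet.file_utils import read_symbol_table
+#   symbol_table = read_symbol_table('data/dict/lang_char.txt')
+#   conf = dict(shuffle=False, sort=False, spec_aug=False,
+#               fbank_conf={'num_mel_bins': 80},
+#               batch_conf={'batch_type': 'static', 'batch_size': 16})
+#   test_set = Dataset('shard', 'data/test/data.list', symbol_table, conf)
+#   for keys, feats, labels, feats_lengths, label_lengths in test_set:
+#       pass  # feed feats / feats_lengths to the conformer engine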
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py b/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b7e516cc61f759267f4ef09309ff0b45110a0c1
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def read_lists(list_file):
+ lists = []
+ with open(list_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ lists.append(line.strip())
+ return lists
+
+
+def read_non_lang_symbols(non_lang_sym_path):
+ """read non-linguistic symbol from file.
+
+ The file format is like below:
+
+ {NOISE}\n
+ {BRK}\n
+ ...
+
+
+ Args:
+ non_lang_sym_path: non-linguistic symbol file path; None means there
+ are no such symbols.
+
+ """
+ if non_lang_sym_path is None:
+ return None
+ else:
+ syms = read_lists(non_lang_sym_path)
+ non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
+ for sym in syms:
+ if non_lang_syms_pattern.fullmatch(sym) is None:
+ class BadSymbolFormat(Exception):
+ pass
+ raise BadSymbolFormat(
+ "Non-linguistic symbols should be "
+ "formatted as {xxx}, <xxx> or [xxx]; consider"
+ " modifying '%s' to meet the requirement. "
+ "More details can be found in the discussion here: "
+ "https://github.com/wenet-e2e/wenet/pull/819" % (sym))
+ return syms
+
+
+def read_symbol_table(symbol_table_file):
+ symbol_table = {}
+ with open(symbol_table_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ symbol_table[arr[0]] = int(arr[1])
+ return symbol_table
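+
+# Illustrative file formats (assumptions based on how the readers above parse
+# their inputs):
+#   symbol table, one "symbol id" pair per line:
+#       <blank> 0
+#       <unk> 1
+#       ...
+#   non-linguistic symbol list, one symbol per line in {xxx}/<xxx>/[xxx] form:
+#       {NOISE}
+#       {BRK}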
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py b/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a542a3d204cdb3def8cf61ce0b0fd8bb31af32e
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py
@@ -0,0 +1,550 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import json
+import random
+import re
+import tarfile
+from subprocess import PIPE, Popen
+from urllib.parse import urlparse
+
+import torch
+import torchaudio
+import torchaudio.compliance.kaldi as kaldi
+from torch.nn.utils.rnn import pad_sequence
+
+AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
+
+
+def url_opener(data):
+ """ Give url or local file, return file descriptor
+ Inplace operation.
+
+ Args:
+ data(Iterable[str]): url or local file list
+
+ Returns:
+ Iterable[{src, stream}]
+ """
+ for sample in data:
+ assert 'src' in sample
+ # TODO(Binbin Zhang): support HTTP
+ url = sample['src']
+ try:
+ pr = urlparse(url)
+ # local file
+ if pr.scheme == '' or pr.scheme == 'file':
+ stream = open(url, 'rb')
+ # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP
+ else:
+ cmd = f'curl -s -L {url}'
+ process = Popen(cmd, shell=True, stdout=PIPE)
+ sample.update(process=process)
+ stream = process.stdout
+ sample.update(stream=stream)
+ yield sample
+ except Exception as ex:
+ logging.warning('Failed to open {}'.format(url))
+
+
+def tar_file_and_group(data):
+ """ Expand a stream of open tar files into a stream of tar file contents.
+ And groups the file with same prefix
+
+ Args:
+ data: Iterable[{src, stream}]
+
+ Returns:
+ Iterable[{key, wav, txt, sample_rate}]
+ """
+ for sample in data:
+ assert 'stream' in sample
+ stream = tarfile.open(fileobj=sample['stream'], mode="r|*")
+ prev_prefix = None
+ example = {}
+ valid = True
+ for tarinfo in stream:
+ name = tarinfo.name
+ pos = name.rfind('.')
+ assert pos > 0
+ prefix, postfix = name[:pos], name[pos + 1:]
+ if prev_prefix is not None and prefix != prev_prefix:
+ example['key'] = prev_prefix
+ if valid:
+ yield example
+ example = {}
+ valid = True
+ with stream.extractfile(tarinfo) as file_obj:
+ try:
+ if postfix == 'txt':
+ example['txt'] = file_obj.read().decode('utf8').strip()
+ elif postfix in AUDIO_FORMAT_SETS:
+ waveform, sample_rate = torchaudio.load(file_obj)
+ example['wav'] = waveform
+ example['sample_rate'] = sample_rate
+ else:
+ example[postfix] = file_obj.read()
+ except Exception as ex:
+ valid = False
+ logging.warning('failed to parse {}'.format(name))
+ prev_prefix = prefix
+ if prev_prefix is not None:
+ example['key'] = prev_prefix
+ yield example
+ stream.close()
+ if 'process' in sample:
+ sample['process'].communicate()
+ sample['stream'].close()
+
+
+def parse_raw(data):
+ """ Parse key/wav/txt from json line
+
+ Args:
+ data: Iterable[str], where each str is a json line with key/wav/txt
+
+ Returns:
+ Iterable[{key, wav, txt, sample_rate}]
+ """
+ for sample in data:
+ assert 'src' in sample
+ json_line = sample['src']
+ obj = json.loads(json_line)
+ assert 'key' in obj
+ assert 'wav' in obj
+ assert 'txt' in obj
+ key = obj['key']
+ wav_file = obj['wav']
+ txt = obj['txt']
+ try:
+ if 'start' in obj:
+ assert 'end' in obj
+ sample_rate = torchaudio.backend.sox_io_backend.info(
+ wav_file).sample_rate
+ start_frame = int(obj['start'] * sample_rate)
+ end_frame = int(obj['end'] * sample_rate)
+ waveform, _ = torchaudio.backend.sox_io_backend.load(
+ filepath=wav_file,
+ num_frames=end_frame - start_frame,
+ frame_offset=start_frame)
+ else:
+ waveform, sample_rate = torchaudio.load(wav_file)
+ example = dict(key=key,
+ txt=txt,
+ wav=waveform,
+ sample_rate=sample_rate)
+ yield example
+ except Exception as ex:
+ logging.warning('Failed to read {}'.format(wav_file))
+
+
+def filter(data,
+ max_length=10240,
+ min_length=10,
+ token_max_length=200,
+ token_min_length=1,
+ min_output_input_ratio=0.0005,
+ max_output_input_ratio=1):
+ """ Filter sample according to feature and label length
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ max_length: drop utterances longer than max_length (in 10 ms frames)
+ min_length: drop utterances shorter than min_length (in 10 ms frames)
+ token_max_length: drop utterances with more than token_max_length
+ tokens, especially when using char units for
+ English modeling
+ token_min_length: drop utterances with fewer than token_min_length
+ tokens
+ min_output_input_ratio: minimal ratio of
+ token_length / feats_length (per 10 ms frame)
+ max_output_input_ratio: maximum ratio of
+ token_length / feats_length (per 10 ms frame)
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ assert 'label' in sample
+ # sample['wav'] is torch.Tensor, we have 100 frames every second
+ num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100
+ if num_frames < min_length:
+ continue
+ if num_frames > max_length:
+ continue
+ if len(sample['label']) < token_min_length:
+ continue
+ if len(sample['label']) > token_max_length:
+ continue
+ if num_frames != 0:
+ if len(sample['label']) / num_frames < min_output_input_ratio:
+ continue
+ if len(sample['label']) / num_frames > max_output_input_ratio:
+ continue
+ yield sample
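+
+# Illustrative filter_conf (assumed values; units follow the docstring, i.e.
+# frame counts at a 10 ms frame shift):
+#   filter_conf = dict(max_length=10240,   # ~102 s of 10 ms frames
+#                      min_length=10,      # 0.1 s
+#                      token_max_length=200,
+#                      token_min_length=1)
+#   dataset = Processor(dataset, filter, **filter_conf)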
+
+
+def resample(data, resample_rate=16000):
+ """ Resample data.
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ resample_rate: target resample rate
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ if sample_rate != resample_rate:
+ sample['sample_rate'] = resample_rate
+ sample['wav'] = torchaudio.transforms.Resample(
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+ yield sample
+
+
+def speed_perturb(data, speeds=None):
+ """ Apply speed perturb to the data.
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ speeds(List[float]): optional speed
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ if speeds is None:
+ speeds = [0.9, 1.0, 1.1]
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ speed = random.choice(speeds)
+ if speed != 1.0:
+ wav, _ = torchaudio.sox_effects.apply_effects_tensor(
+ waveform, sample_rate,
+ [['speed', str(speed)], ['rate', str(sample_rate)]])
+ sample['wav'] = wav
+
+ yield sample
+
+
+def compute_fbank(data,
+ num_mel_bins=23,
+ frame_length=25,
+ frame_shift=10,
+ dither=0.0):
+ """ Extract fbank
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ assert 'key' in sample
+ assert 'label' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ waveform = waveform * (1 << 15)
+ # Only keep key, feat, label
+ mat = kaldi.fbank(waveform,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ energy_floor=0.0,
+ sample_frequency=sample_rate)
+ yield dict(key=sample['key'], label=sample['label'], feat=mat)
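+
+# Illustrative fbank_conf for 16 kHz speech (assumed values, tune per model):
+#   fbank_conf = dict(num_mel_bins=80, frame_length=25, frame_shift=10, dither=0.0)
+#   dataset = Processor(dataset, compute_fbank, **fbank_conf)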
+
+
+def __tokenize_by_bpe_model(sp, txt):
+ tokens = []
+ # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ pattern = re.compile(r'([\u4e00-\u9fff])')
+ # Example:
+ # txt = "你好 ITS'S OKAY 的"
+ # chars = ["你", "好", " ITS'S OKAY ", "的"]
+ chars = pattern.split(txt.upper())
+ mix_chars = [w for w in chars if len(w.strip()) > 0]
+ for ch_or_w in mix_chars:
+ # ch_or_w is a single CJK character (e.g., "你"), do nothing.
+ if pattern.fullmatch(ch_or_w) is not None:
+ tokens.append(ch_or_w)
+ # ch_or_w contains non-CJK characters (e.g., " IT'S OKAY "),
+ # encode ch_or_w using bpe_model.
+ else:
+ for p in sp.encode_as_pieces(ch_or_w):
+ tokens.append(p)
+
+ return tokens
+
+
+def tokenize(data, symbol_table, bpe_model=None, non_lang_syms=None,
+ split_with_space=False):
+ """ Decode text to chars or BPE
+ Inplace operation
+
+ Args:
+ data: Iterable[{key, wav, txt, sample_rate}]
+
+ Returns:
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
+ """
+ if non_lang_syms is not None:
+ non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
+ else:
+ non_lang_syms = {}
+ non_lang_syms_pattern = None
+
+ if bpe_model is not None:
+ import sentencepiece as spm
+ sp = spm.SentencePieceProcessor()
+ sp.load(bpe_model)
+ else:
+ sp = None
+
+ for sample in data:
+ assert 'txt' in sample
+ txt = sample['txt'].strip()
+ if non_lang_syms_pattern is not None:
+ parts = non_lang_syms_pattern.split(txt.upper())
+ parts = [w for w in parts if len(w.strip()) > 0]
+ else:
+ parts = [txt]
+
+ label = []
+ tokens = []
+ for part in parts:
+ if part in non_lang_syms:
+ tokens.append(part)
+ else:
+ if bpe_model is not None:
+ tokens.extend(__tokenize_by_bpe_model(sp, part))
+ else:
+ if split_with_space:
+ part = part.split(" ")
+ for ch in part:
+ if ch == ' ':
+ ch = "▁"
+ tokens.append(ch)
+
+ for ch in tokens:
+ if ch in symbol_table:
+ label.append(symbol_table[ch])
+ elif '<unk>' in symbol_table:
+ label.append(symbol_table['<unk>'])
+
+ sample['tokens'] = tokens
+ sample['label'] = label
+ yield sample
+
+
+def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80):
+ """ Do spec augmentation
+ Inplace operation
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ num_t_mask: number of time mask to apply
+ num_f_mask: number of freq mask to apply
+ max_t: max width of time mask
+ max_f: max width of freq mask
+ max_w: max width of time warp
+
+ Returns
+ Iterable[{key, feat, label}]
+ """
+ for sample in data:
+ assert 'feat' in sample
+ x = sample['feat']
+ assert isinstance(x, torch.Tensor)
+ y = x.clone().detach()
+ max_frames = y.size(0)
+ max_freq = y.size(1)
+ # time mask
+ for i in range(num_t_mask):
+ start = random.randint(0, max_frames - 1)
+ length = random.randint(1, max_t)
+ end = min(max_frames, start + length)
+ y[start:end, :] = 0
+ # freq mask
+ for i in range(num_f_mask):
+ start = random.randint(0, max_freq - 1)
+ length = random.randint(1, max_f)
+ end = min(max_freq, start + length)
+ y[:, start:end] = 0
+ sample['feat'] = y
+ yield sample
+
+
+def shuffle(data, shuffle_size=10000):
+ """ Local shuffle the data
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ shuffle_size: buffer size for shuffle
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= shuffle_size:
+ random.shuffle(buf)
+ for x in buf:
+ yield x
+ buf = []
+ # The sample left over
+ random.shuffle(buf)
+ for x in buf:
+ yield x
+
+
+def sort(data, sort_size=500):
+ """ Sort the data by feature length.
+ Sort is used after shuffle and before batch, so we can group
+ utts with similar lengths into a batch, and `sort_size` should
+ be less than `shuffle_size`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ sort_size: buffer size for sort
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= sort_size:
+ buf.sort(key=lambda x: x['feat'].size(0))
+ for x in buf:
+ yield x
+ buf = []
+ # The sample left over
+ buf.sort(key=lambda x: x['feat'].size(0))
+ for x in buf:
+ yield x
+
+
+def static_batch(data, batch_size=16):
+ """ Static batch the data by `batch_size`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ batch_size: batch size
+
+ Returns:
+ Iterable[List[{key, feat, label}]]
+ """
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= batch_size:
+ yield buf
+ buf = []
+ if len(buf) > 0:
+ yield buf
+
+
+def dynamic_batch(data, max_frames_in_batch=12000):
+ """ Dynamic batch the data until the total frames in batch
+ reach `max_frames_in_batch`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ max_frames_in_batch: max_frames in one batch
+
+ Returns:
+ Iterable[List[{key, feat, label}]]
+ """
+ buf = []
+ longest_frames = 0
+ for sample in data:
+ assert 'feat' in sample
+ assert isinstance(sample['feat'], torch.Tensor)
+ new_sample_frames = sample['feat'].size(0)
+ longest_frames = max(longest_frames, new_sample_frames)
+ frames_after_padding = longest_frames * (len(buf) + 1)
+ if frames_after_padding > max_frames_in_batch:
+ yield buf
+ buf = [sample]
+ longest_frames = new_sample_frames
+ else:
+ buf.append(sample)
+ if len(buf) > 0:
+ yield buf
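+
+# Illustrative dynamic batching (assumed values): with max_frames_in_batch=12000
+# and utterances of ~900 frames each, frames_after_padding grows as
+# 900 * (len(buf) + 1), so the 14th utterance would push it to 12600 > 12000
+# and a batch of 13 utterances is emitted.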
+
+
+def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000):
+ """ Wrapper for static/dynamic batch
+ """
+ if batch_type == 'static':
+ return static_batch(data, batch_size)
+ elif batch_type == 'dynamic':
+ return dynamic_batch(data, max_frames_in_batch)
+ else:
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
+
+
+def padding(data):
+ """ Padding the data into training data
+
+ Args:
+ data: Iterable[List[{key, feat, label}]]
+
+ Returns:
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
+ """
+ for sample in data:
+ assert isinstance(sample, list)
+ feats_length = torch.tensor([x['feat'].size(0) for x in sample],
+ dtype=torch.int32)
+ order = torch.argsort(feats_length, descending=True)
+ feats_lengths = torch.tensor(
+ [sample[i]['feat'].size(0) for i in order], dtype=torch.int32)
+ sorted_feats = [sample[i]['feat'] for i in order]
+ sorted_keys = [sample[i]['key'] for i in order]
+ sorted_labels = [
+ torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order
+ ]
+ label_lengths = torch.tensor([x.size(0) for x in sorted_labels],
+ dtype=torch.int32)
+
+ padded_feats = pad_sequence(sorted_feats,
+ batch_first=True,
+ padding_value=0)
+ padding_labels = pad_sequence(sorted_labels,
+ batch_first=True,
+ padding_value=-1)
+
+ yield (sorted_keys, padded_feats, padding_labels, feats_lengths,
+ label_lengths)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/README.md b/models/speech/speech_recognition/transformer_asr/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c2e1b456d5fe38efdda736439c1361a14dcedcd
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/README.md
@@ -0,0 +1,83 @@
+# Transformer ASR (Beam Search)
+
+## Description
+
+Beam search lets us exert control over the output of text generation, which is useful because we sometimes know exactly what we want in the output. For example, in a neural machine translation task, a dictionary lookup can tell us which words must be included in the final translation.
+
+## Setup
+
+### Install
+
+```bash
+pip3 install speechbrain==0.5.13
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the Aishell dataset.
+
+```bash
+# Make sure the checkpoint path is results/transformer/8886/save
+mkdir -p results/transformer/8886/save
+# The data layout should look like this:
+results/transformer/8886
+├── cer.txt
+├── dev.csv
+├── env.log
+├── hyperparams.yaml
+├── inference_encoder_ctc.py
+├── inference.py
+├── log.txt
+├── save
+│ ├── CKPT+2023-03-29+06-31-40+00
+│ │ ├── brain.ckpt
+│ │ ├── CKPT.yaml
+│ │ ├── counter.ckpt
+│ │ ├── model.ckpt
+│ │ ├── noam_scheduler.ckpt
+│ │ └── normalizer.ckpt
+│ └── tokenizer.ckpt
+├── test.csv
+├── train.csv
+└── train_log.txt
+
+# Make sure the dataset path is /home/data/speechbrain/aishell
+mkdir -p /home/data/speechbrain/aishell/csv_data
+ln -s /PATH/to/data_aishell /home/data/speechbrain/aishell/
+cp results/transformer/8886/*.csv /home/data/speechbrain/aishell/csv_data
+```
+
+## Inference
+
+### Build faster kernels
+
+```bash
+bash build.sh
+```
+
+### Build engine
+
+Set max_batch_size and max_seq_len according to your workload.
+
+```bash
+python3 builder.py \
+--ckpt_path results/transformer/8886/save \
+--head_num 4 \
+--max_batch_size 64 \
+--max_seq_len 1024 \
+--engine_path transformer.engine
+```
+
+### Run engine
+
+```bash
+python3 inference.py hparams/train_ASR_transformer.yaml --data_folder=/home/data/speechbrain/aishell --engine_path transformer.engine
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | CER |
+| --------------- | --------- | --------- | ----- | ---- |
+| Transformer ASR | 32 | FP16 | 15.64 | 5.95 |
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py b/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba31939402691ec29525480cf0070e3016654b8d
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import shutil
+import logging
+from speechbrain.dataio.dataio import read_audio
+from speechbrain.utils.data_utils import download_file
+import glob
+import csv
+import argparse
+
+logger = logging.getLogger(__name__)
+
+
+def prepare_aishell(data_folder, save_folder, skip_prep=False):
+ """
+ This function prepares the AISHELL-1 dataset.
+ If the wav folder has not been extracted yet, the per-speaker archives under data_aishell/wav are unpacked first.
+
+ data_folder : path to AISHELL-1 dataset.
+ save_folder: path where to store the manifest csv files.
+ skip_prep: If True, skip data preparation.
+
+ """
+ if skip_prep:
+ return
+
+ # If the data folders do not exist, we need to extract the data
+ if not os.path.isdir(os.path.join(data_folder, "data_aishell/wav")):
+ # # Check for zip file and download if it doesn't exist
+ # zip_location = os.path.join(data_folder, "data_aishell.tgz")
+ # if not os.path.exists(zip_location):
+ # url = "https://www.openslr.org/resources/33/data_aishell.tgz"
+ # download_file(url, zip_location, unpack=True)
+ # logger.info("Extracting data_aishell.tgz...")
+ # shutil.unpack_archive(zip_location, data_folder)
+
+ wav_dir = os.path.join(data_folder, "data_aishell/wav")
+ tgz_list = glob.glob(wav_dir + "/*.tar.gz")
+ for tgz in tgz_list:
+ shutil.unpack_archive(tgz, wav_dir)
+ os.remove(tgz)
+
+ # Create filename-to-transcript dictionary
+ filename2transcript = {}
+ with open(
+ os.path.join(
+ data_folder, "data_aishell/transcript/aishell_transcript_v0.8.txt"
+ ),
+ "r",
+ ) as f:
+ lines = f.readlines()
+ for line in lines:
+ key = line.split()[0]
+ value = " ".join(line.split()[1:])
+ filename2transcript[key] = value
+
+ splits = [
+ # "train",
+ "dev",
+ "test",
+ ]
+ ID_start = 0 # needed to have a unique ID for each audio
+ for split in splits:
+ new_filename = os.path.join(save_folder, split) + ".csv"
+ if os.path.exists(new_filename):
+ continue
+ logger.info("Preparing %s..." % new_filename)
+
+ csv_output = [["ID", "duration", "wav", "transcript"]]
+ entry = []
+
+ all_wavs = glob.glob(
+ os.path.join(data_folder, "data_aishell/wav") + "/" + split + "/*/*.wav"
+ )
+ for i in range(len(all_wavs)):
+ filename = all_wavs[i].split("/")[-1].split(".wav")[0]
+ if filename not in filename2transcript:
+ continue
+ signal = read_audio(all_wavs[i])
+ duration = signal.shape[0] / 16000
+ transcript_ = filename2transcript[filename]
+ csv_line = [
+ ID_start + i,
+ str(duration),
+ all_wavs[i],
+ transcript_,
+ ]
+ entry.append(csv_line)
+
+ csv_output = csv_output + entry
+
+ with open(new_filename, mode="w") as csv_f:
+ csv_writer = csv.writer(
+ csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
+ )
+ for line in csv_output:
+ csv_writer.writerow(line)
+
+ msg = "\t%s successfully created!" % (new_filename)
+ logger.info(msg)
+
+ ID_start += len(all_wavs)
+
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--data_folder",
+ type=str,
+ default="/home/data/speechbrain/aishell",
+ help="data folder",
+ )
+ parser.add_argument(
+ "--save_folder",
+ type=str,
+ default="/home/data/speechbrain/aishell/csv_data",
+ help="csv save folder",
+ )
+
+ config = parser.parse_args()
+ print("Config:", config)
+ return config
+
+
+if __name__ == "__main__":
+
+ config = parse_config()
+ prepare_aishell(config.data_folder, config.save_folder, skip_prep=False)
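+
+# The generated dev.csv / test.csv files follow the header written above
+# (ID, duration, wav, transcript); an illustrative row (placeholder values,
+# not real data) looks like:
+#   42,5.21,/home/data/speechbrain/aishell/data_aishell/wav/test/SXXXX/BAC009SXXXXWXXXX.wav,<chinese transcript>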
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py b/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e5c794ad9ff8c0d517c666c08295f36551e463
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+from ctc import CTCPrefixScorer
+import time
+
+def forward(self, enc_states, wav_len): # noqa: C901
+ """Applies beamsearch and returns the predicted tokens."""
+ enc_lens = torch.round(enc_states.shape[1] * wav_len).int()
+ device = enc_states.device
+ batch_size = enc_states.shape[0]
+
+ memory = self.reset_mem(batch_size * self.beam_size, device=device)
+
+ if self.lm_weight > 0:
+ lm_memory = self.reset_lm_mem(batch_size * self.beam_size, device)
+
+ if self.ctc_weight > 0:
+ # (batch_size * beam_size, L, vocab_size)
+ ctc_outputs = self.ctc_forward_step(enc_states)
+ ctc_scorer = CTCPrefixScorer(
+ ctc_outputs,
+ enc_lens,
+ batch_size,
+ self.beam_size,
+ self.blank_index,
+ self.eos_index,
+ self.ctc_window_size,
+ )
+ ctc_memory = None
+
+ # Inflate the enc_states and enc_len by beam_size times
+ enc_states = inflate_tensor(enc_states, times=self.beam_size, dim=0)
+ enc_lens = inflate_tensor(enc_lens, times=self.beam_size, dim=0)
+
+ # Using bos as the first input
+ inp_tokens = (
+ torch.zeros(batch_size * self.beam_size, device=device)
+ .fill_(self.bos_index)
+ .long()
+ )
+
+ # The first index of each sentence.
+ self.beam_offset = (
+ torch.arange(batch_size, device=device) * self.beam_size
+ )
+
+ # initialize sequence scores variables.
+ sequence_scores = torch.empty(
+ batch_size * self.beam_size, device=device
+ )
+ sequence_scores.fill_(float("-inf"))
+
+ # keep only the first to make sure no redundancy.
+ sequence_scores.index_fill_(0, self.beam_offset, 0.0)
+
+ # keep the hypotheses that reach eos, with their corresponding scores and log_probs.
+ hyps_and_scores = [[] for _ in range(batch_size)]
+
+ # keep the sequences that have not yet reached eos.
+ alived_seq = torch.empty(
+ batch_size * self.beam_size, 0, device=device
+ ).long()
+
+ # Keep the log-probabilities of alived sequences.
+ alived_log_probs = torch.empty(
+ batch_size * self.beam_size, 0, device=device
+ )
+
+ min_decode_steps = int(enc_states.shape[1] * self.min_decode_ratio)
+ max_decode_steps = int(enc_states.shape[1] * self.max_decode_ratio)
+
+ # Initialize the previous attention peak to zero
+ # This variable will be used when using_max_attn_shift=True
+ prev_attn_peak = torch.zeros(batch_size * self.beam_size, device=device)
+
+ for t in range(max_decode_steps):
+ # terminate condition
+ if self._check_full_beams(hyps_and_scores, self.beam_size):
+ break
+
+ log_probs, memory, attn = self.forward_step(
+ inp_tokens, memory, enc_states, enc_lens
+ )
+ log_probs = self.att_weight * log_probs
+
+ # Keep the original value
+ log_probs_clone = log_probs.clone().reshape(batch_size, -1)
+ vocab_size = log_probs.shape[-1]
+
+ if self.using_max_attn_shift:
+ # Block the candidates that exceed the max shift
+ cond, attn_peak = self._check_attn_shift(attn, prev_attn_peak)
+ log_probs = mask_by_condition(
+ log_probs, cond, fill_value=self.minus_inf
+ )
+ prev_attn_peak = attn_peak
+
+ # Set eos to minus_inf when less than minimum steps.
+ if t < min_decode_steps:
+ log_probs[:, self.eos_index] = self.minus_inf
+
+ # Set the eos prob to minus_inf when it doesn't exceed threshold.
+ if self.using_eos_threshold:
+ cond = self._check_eos_threshold(log_probs)
+ log_probs[:, self.eos_index] = mask_by_condition(
+ log_probs[:, self.eos_index],
+ cond,
+ fill_value=self.minus_inf,
+ )
+
+ # adding LM scores to log_prob if lm_weight > 0
+ if self.lm_weight > 0:
+ lm_log_probs, lm_memory = self.lm_forward_step(
+ inp_tokens, lm_memory
+ )
+ log_probs = log_probs + self.lm_weight * lm_log_probs
+
+ # adding CTC scores to log_prob if ctc_weight > 0
+ if self.ctc_weight > 0:
+ g = alived_seq
+ # block blank token
+ log_probs[:, self.blank_index] = self.minus_inf
+ if self.ctc_weight != 1.0 and self.ctc_score_mode == "partial":
+ # pruning vocab for ctc_scorer
+ _, ctc_candidates = log_probs.topk(
+ self.beam_size * 2, dim=-1
+ )
+ else:
+ ctc_candidates = None
+
+ ctc_log_probs, ctc_memory = ctc_scorer.forward_step(
+ g, ctc_memory, ctc_candidates, attn
+ )
+ log_probs = log_probs + self.ctc_weight * ctc_log_probs
+
+ scores = sequence_scores.unsqueeze(1).expand(-1, vocab_size)
+ scores = scores + log_probs
+
+ # length normalization
+ if self.length_normalization:
+ scores = scores / (t + 1)
+
+ # keep topk beams
+ scores, candidates = scores.view(batch_size, -1).topk(
+ self.beam_size, dim=-1
+ )
+
+ # The input for the next step, also the output of current step.
+ inp_tokens = (candidates % vocab_size).view(
+ batch_size * self.beam_size
+ )
+
+ scores = scores.view(batch_size * self.beam_size)
+ sequence_scores = scores
+
+ # recover the length normalization
+ if self.length_normalization:
+ sequence_scores = sequence_scores * (t + 1)
+
+ # The index of which beam the current top-K output came from in (t-1) timesteps.
+ predecessors = (
+ torch.div(candidates, vocab_size, rounding_mode="floor")
+ + self.beam_offset.unsqueeze(1).expand_as(candidates)
+ ).view(batch_size * self.beam_size)
+
+ # Permute the memory to synchronize with the output.
+ memory = self.permute_mem(memory, index=predecessors)
+ if self.lm_weight > 0:
+ lm_memory = self.permute_lm_mem(lm_memory, index=predecessors)
+
+ if self.ctc_weight > 0:
+ ctc_memory = ctc_scorer.permute_mem(ctc_memory, candidates)
+
+ # If using_max_attn_shift, then the previous attn peak has to be permuted too.
+ if self.using_max_attn_shift:
+ prev_attn_peak = torch.index_select(
+ prev_attn_peak, dim=0, index=predecessors
+ )
+
+ # Add coverage penalty
+ if self.coverage_penalty > 0:
+ cur_attn = torch.index_select(attn, dim=0, index=predecessors)
+
+ # coverage: cumulative attention probability vector
+ if t == 0:
+ # Init coverage
+ self.coverage = cur_attn
+
+ # the attn of transformer is [batch_size*beam_size, current_step, source_len]
+ if len(cur_attn.size()) > 2:
+ self.coverage = torch.sum(cur_attn, dim=1)
+ else:
+ # Update coverage
+ self.coverage = torch.index_select(
+ self.coverage, dim=0, index=predecessors
+ )
+ self.coverage = self.coverage + cur_attn
+
+ # Compute coverage penalty and add it to scores
+ penalty = torch.max(
+ self.coverage, self.coverage.clone().fill_(0.5)
+ ).sum(-1)
+ penalty = penalty - self.coverage.size(-1) * 0.5
+ penalty = penalty.view(batch_size * self.beam_size)
+ penalty = (
+ penalty / (t + 1) if self.length_normalization else penalty
+ )
+ scores = scores - penalty * self.coverage_penalty
+
+ # Update alived_seq
+ alived_seq = torch.cat(
+ [
+ torch.index_select(alived_seq, dim=0, index=predecessors),
+ inp_tokens.unsqueeze(1),
+ ],
+ dim=-1,
+ )
+
+ # Takes the log-probabilities
+ beam_log_probs = log_probs_clone[
+ torch.arange(batch_size).unsqueeze(1), candidates
+ ].reshape(batch_size * self.beam_size)
+ alived_log_probs = torch.cat(
+ [
+ torch.index_select(
+ alived_log_probs, dim=0, index=predecessors
+ ),
+ beam_log_probs.unsqueeze(1),
+ ],
+ dim=-1,
+ )
+
+ is_eos = self._update_hyp_and_scores(
+ inp_tokens,
+ alived_seq,
+ alived_log_probs,
+ hyps_and_scores,
+ scores,
+ timesteps=t,
+ )
+
+ # Block the paths that have reached eos.
+ sequence_scores.masked_fill_(is_eos, float("-inf"))
+
+ if not self._check_full_beams(hyps_and_scores, self.beam_size):
+ # Using all eos to fill-up the hyps.
+ eos = (
+ torch.zeros(batch_size * self.beam_size, device=device)
+ .fill_(self.eos_index)
+ .long()
+ )
+ _ = self._update_hyp_and_scores(
+ eos,
+ alived_seq,
+ alived_log_probs,
+ hyps_and_scores,
+ scores,
+ timesteps=max_decode_steps,
+ )
+
+ (
+ topk_hyps,
+ topk_scores,
+ topk_lengths,
+ log_probs,
+ ) = self._get_top_score_prediction(hyps_and_scores, topk=self.topk,)
+ # pick the best hyp
+ predictions = topk_hyps[:, 0, :]
+ predictions = batch_filter_seq2seq_output(
+ predictions, eos_id=self.eos_index
+ )
+
+ if self.return_log_probs:
+ return predictions, topk_scores, log_probs
+ else:
+ return predictions, topk_scores
+
+
+def inflate_tensor(tensor, times, dim):
+ """This function inflates the tensor for times along dim.
+
+ Arguments
+ ---------
+ tensor : torch.Tensor
+ The tensor to be inflated.
+ times : int
+ The tensor will inflate for this number of times.
+ dim : int
+ The dim to be inflated.
+
+ Returns
+ -------
+ torch.Tensor
+ The inflated tensor.
+
+ Example
+ -------
+ >>> tensor = torch.Tensor([[1,2,3], [4,5,6]])
+ >>> new_tensor = inflate_tensor(tensor, 2, dim=0)
+ >>> new_tensor
+ tensor([[1., 2., 3.],
+ [1., 2., 3.],
+ [4., 5., 6.],
+ [4., 5., 6.]])
+ """
+ return torch.repeat_interleave(tensor, times, dim=dim)
+
+def batch_filter_seq2seq_output(prediction, eos_id=-1):
+ """Calling batch_size times of filter_seq2seq_output.
+
+ Arguments
+ ---------
+ prediction : list of torch.Tensor
+ A list containing the output ints predicted by the seq2seq system.
+ eos_id : int, string
+ The id of the eos.
+
+ Returns
+ ------
+ list
+ The output predicted by seq2seq model.
+
+ Example
+ -------
+ >>> predictions = [torch.IntTensor([1,2,3,4]), torch.IntTensor([2,3,4,5,6])]
+ >>> predictions = batch_filter_seq2seq_output(predictions, eos_id=4)
+ >>> predictions
+ [[1, 2, 3], [2, 3]]
+ """
+ outputs = []
+ for p in prediction:
+ res = filter_seq2seq_output(p.tolist(), eos_id=eos_id)
+ outputs.append(res)
+ return outputs
+
+def filter_seq2seq_output(string_pred, eos_id=-1):
+ """Filter the output until the first eos occurs (exclusive).
+
+ Arguments
+ ---------
+ string_pred : list
+ A list containing the output strings/ints predicted by the seq2seq system.
+ eos_id : int, string
+ The id of the eos.
+
+ Returns
+ ------
+ list
+ The output predicted by seq2seq model.
+
+ Example
+ -------
+ >>> string_pred = ['a','b','c','d','eos','e']
+ >>> string_out = filter_seq2seq_output(string_pred, eos_id='eos')
+ >>> string_out
+ ['a', 'b', 'c', 'd']
+ """
+ if isinstance(string_pred, list):
+ try:
+ eos_index = next(
+ i for i, v in enumerate(string_pred) if v == eos_id
+ )
+ except StopIteration:
+ eos_index = len(string_pred)
+ string_out = string_pred[:eos_index]
+ else:
+ raise ValueError("The input must be a list.")
+ return string_out
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a899123463c3cd453fcc7a0677c81b9235e410d2
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/build.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+for i in fast*
+do
+ cd $i
+ bash build.sh
+ cd ..
+done
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/builder.py b/models/speech/speech_recognition/transformer_asr/ixrt/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c19a9f4bdd2133138621d0e39aebacf09e133f9
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/builder.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import torch
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+from tensorrt.deploy.ir.data_type import DataType
+from tensorrt.deploy.ir.variable import Variable, VariableOptions
+from tensorrt.deploy.ir.graph import Graph
+from collections import OrderedDict
+import math
+import re
+import glob
+import os
+from onnx import numpy_helper
+import subprocess
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="build ixrt engine", usage=""
+ )
+ parser.add_argument(
+ "--ckpt_path",
+ type=str,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--head_num",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--max_batch_size",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--max_seq_len",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ default=".tmp.onnx",
+ help="",
+ )
+ parser.add_argument(
+ "--engine_path",
+ type=str,
+ required=True,
+ help="",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def add_make_mask_op(graph, state_dict, args):
+ attributes = {}
+
+ t = graph
+ inputs = [
+ graph.make_variable('length_radio', dtype=DataType.FLOAT16),
+ graph.make_variable('input', dtype=DataType.FLOAT16),
+ ]
+
+ outputs = [t.make_variable("attention_mask", dtype=DataType.INT32)]
+
+ t.make_operator(
+ "MakeMaskByRadio_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_custom_linear_op(graph, state_dict, args):
+ linear_keys = [
+ "1.custom_src_module.layers.0.w.weight",
+ "1.custom_src_module.layers.0.w.bias"
+ ]
+ W = numpy_helper.from_array(state_dict[linear_keys[0]].cpu().numpy(), name="W")
+ B = numpy_helper.from_array(state_dict[linear_keys[1]].cpu().numpy(), name="B")
+ attributes = {
+ "out_dims": state_dict["1.custom_src_module.layers.0.w.weight"].size(0),
+ "type_id": 1,
+ "W": W,
+ "B": B,
+ }
+ assert state_dict['1.custom_src_module.layers.0.w.weight'].size(
+ 0) == state_dict["1.custom_src_module.layers.0.w.bias"].size(0)
+
+ t = graph
+ inputs = [
+ graph.get_variable('input'),
+ ]
+
+ outputs = [t.make_variable("custom_src_output")]
+ t.make_operator(
+ "CustomFCPluginDynamic_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+# def add_custom_linear_op(graph, state_dict, args):
+# linear_keys = [
+# "1.custom_src_module.layers.0.w.weight",
+# "1.custom_src_module.layers.0.w.bias"
+# ]
+# attributes = {
+# "linear_dim": state_dict["1.custom_src_module.layers.0.w.weight"].size(0),
+# "hidden_size": state_dict["1.custom_src_module.layers.0.w.weight"].size(1),
+# "has_bias": 1,
+# "act_type": "none",
+# }
+# assert state_dict['1.custom_src_module.layers.0.w.weight'].size(
+# 0) == state_dict["1.custom_src_module.layers.0.w.bias"].size(0)
+#
+# t = graph
+# inputs = [
+# graph.get_variable('input'),
+# ]
+#
+# outputs = [t.make_variable("custom_src_output",dtype=DataType.FLOAT16)]
+# for key in linear_keys:
+# inputs.append(t.make_variable(name=key, value=state_dict[key].half()))
+# t.make_operator(
+# "LinearFP16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+
+def add_pos_encode_op(graph, state_dict, args):
+ attributes = {}
+ t = graph
+ inputs = [
+ graph.get_variable('custom_src_output'),
+ ]
+ outputs = [t.make_variable("hidden_state", dtype=DataType.FLOAT16)]
+ t.make_operator(
+ "PosEncodeSinCos_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_transformer_op(graph, state_dict, args):
+ enc_tensor_layer_fp16_keys = OrderedDict([
+ ["1.encoder.layers.{}.norm1.norm.weight", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm1.norm.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.in_proj_weight",
+ [args.hidden_size * 3, args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.in_proj_bias", [args.hidden_size * 3]],
+ ["1.encoder.layers.{}.self_att.att.out_proj.weight",
+ [args.hidden_size, args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.out_proj.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.0.weight",
+ [args.inner_size, args.hidden_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.0.bias", [args.inner_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.3.weight",
+ [args.hidden_size, args.inner_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.3.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm2.norm.weight", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm2.norm.bias", [args.hidden_size]],
+ ])
+ attributes_legcy = {
+ "hidden_size": args.hidden_size,
+ "num_layers": args.num_layers,
+ "head_num": args.head_num,
+ "head_dim": args.head_dim,
+ "inner_size": args.inner_size,
+ "act_type": "gelu",
+ "normalize_before": 1,
+ "is_fmha": 1,
+ "atten_scaler": 1 / math.sqrt(args.head_dim)
+ }
+
+
+ attributes = {
+ "hidden_size": int(args.hidden_size),
+ "num_layers": int(args.num_layers),
+ "head_num": int(args.head_num),
+ "head_dim": int(args.head_dim),
+ "inner_size": int(args.inner_size),
+ "act_type": 12, #gelu
+ "normalize_before": 1,
+ "is_fmha": 1,
+ "atten_scaler": 1.0 / math.sqrt(args.head_dim),
+ "max_seq_len": int(args.max_seq_len),
+ "max_batch_size": int(args.max_batch_size),
+
+ }
+
+ t = graph
+ inputs = [
+ graph.get_variable('hidden_state'),
+ graph.get_variable('attention_mask'),
+ ]
+ outputs = [t.make_variable("encoder_out", dtype=DataType.FLOAT16)]
+ for layer_id in range(args.num_layers):
+ for key, shape in enc_tensor_layer_fp16_keys.items():
+ # we need cat qkv gemm's weight and bias
+ new_key = key.format(layer_id)
+ w = state_dict[new_key]
+ if list(w.shape) != shape:
+ print("weights shape error!")
+ print("key: ", key)
+ print("need shape: ", shape)
+ print("weight shape: ", w.shape)
+ exit(1)
+ inputs.append(t.make_variable(name=new_key, value=w.half()))
+ t.make_operator(
+ "TransformerEncoderFp16_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_layer_norm_op(graph, state_dict, args):
+ enc_ln_tensor_fp16_keys = OrderedDict([
+ ["1.encoder.norm.norm.weight", [args.hidden_size]],
+ ["1.encoder.norm.norm.bias", [args.hidden_size]],
+ ])
+ attributes = {
+ "epsilon": 1e-5,
+ "axis": -1,
+ "stash_type": 1
+ }
+ t = graph
+ inputs = [
+ graph.get_variable('encoder_out'),
+ ]
+ outputs = [t.make_variable("encoder_ln_out")]
+ for key, shape in enc_ln_tensor_fp16_keys.items():
+ new_key = key
+ w = state_dict[new_key]
+ if list(w.shape) != shape:
+ print("weights shape error!")
+ print("key: ", key)
+ print("need shape: ", shape)
+ print("weight shape: ", w.shape)
+ exit(1)
+ inputs.append(t.make_variable(name=new_key, value=w.half()))
+ t.make_operator(
+ "LayerNormalization", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+# def add_layer_norm_op(graph, state_dict, args):
+# enc_ln_tensor_fp16_keys = OrderedDict([
+# ["1.encoder.norm.norm.weight", [args.hidden_size]],
+# ["1.encoder.norm.norm.bias", [args.hidden_size]],
+# ])
+# attributes = {
+# "hidden_size": args.hidden_size,
+# }
+# t = graph
+# inputs = [
+# graph.get_variable('encoder_out'),
+# ]
+# outputs = [t.make_variable("encoder_ln_out",dtype=DataType.FLOAT16)]
+# for key, shape in enc_ln_tensor_fp16_keys.items():
+# new_key = key
+# w = state_dict[new_key]
+# if list(w.shape) != shape:
+# print("weights shape error!")
+# print("key: ", key)
+# print("need shape: ", shape)
+# print("weight shape: ", w.shape)
+# exit(1)
+# inputs.append(t.make_variable(name=new_key, value=w.half()))
+# t.make_operator(
+# "LayerNormFp16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+def add_linear_op(graph, state_dict, args):
+ linear_keys = [
+ "3.w.weight",
+ "3.w.bias"
+ ]
+ W = numpy_helper.from_array(state_dict[linear_keys[0]].cpu().numpy(), name="W")
+ B = numpy_helper.from_array(state_dict[linear_keys[1]].cpu().numpy(), name="B")
+ attributes = {
+ "out_dims": state_dict["3.w.weight"].size(0),
+ "type_id": 1,
+ "W": W,
+ "B": B,
+ }
+ assert state_dict['3.w.weight'].size(0) == state_dict["3.w.bias"].size(0)
+
+ t = graph
+ inputs = [
+ graph.get_variable('encoder_ln_out'),
+ ]
+
+ outputs = [t.make_variable("lin_output")]
+ t.make_operator(
+ "CustomFCPluginDynamic_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+#
+# def add_linear_op(graph, state_dict, args):
+# lin_keys = [
+# "3.w.weight",
+# "3.w.bias"
+# ]
+# attributes = {
+# "linear_dim": state_dict["3.w.weight"].size(0),
+# "hidden_size": state_dict["3.w.weight"].size(1),
+# "has_bias": 1,
+# "act_type": "none",
+# }
+# assert state_dict['3.w.weight'].size(0) == state_dict["3.w.bias"].size(0)
+#
+# t = graph
+# inputs = [
+# graph.get_variable('encoder_ln_out'),
+# ]
+#
+# outputs = [t.make_variable("lin_output",dtype=DataType.FLOAT16)]
+# for key in lin_keys:
+# inputs.append(t.make_variable(name=key, value=state_dict[key].half()))
+# t.make_operator(
+# "LinearFP16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+
+def add_log_softmax_op(graph, state_dict, args):
+ attributes = {
+ "axis": "-1",
+ }
+
+ t = graph
+ inputs = [
+ graph.get_variable('lin_output'),
+ ]
+
+ outputs = [t.make_variable("log_softmax_output", dtype=DataType.FLOAT16)]
+
+ t.make_operator(
+ "LogSoftmax", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_search_node(graph, state_dict, args):
+ attributes = {
+ "vocab_size": args.vocab_size,
+ "eos_id": args.vocab_size,
+ "pad_id": -10000,
+ "beam_size": 1,
+ "attr1": 1.0,
+ "min_decode_ratio": 0.0,
+ "max_decode_ratio": 1.0,
+ "ctc_weight": 0.40,
+ "using_eos_threshold": 0,
+ "length_normalization": 1,
+ }
+ t = graph
+ inputs = [
+ graph.get_variable('lin_output'),
+ ]
+
+ outputs = [t.make_variable("output_tokens", dtype=DataType.INT32)]
+ list_value_half = []
+ list_key_half = []
+ for key in state_dict.keys():
+ if "decoder" in key or "custom_tgt_module" in key or "2.w.weight" in key or "2.w.bias" in key:
+ list_key_half.append(key)
+ list_value_half.append(state_dict[key].half())
+ for i, item in enumerate(list_key_half):
+ inputs.append(t.make_variable(name=list_key_half[i], value=list_value_half[i]))
+ t.make_operator(
+ "Search", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def get_num_layers(state_dict):
+ num_layers = -1
+ for key in state_dict:
+ layer_id = re.search(
+ "1.encoder.layers.([0-9]+).pos_ffn.ffn.0.bias", key)
+ if layer_id:
+ layer_id = layer_id.group(1)
+ num_layers = max(num_layers, int(layer_id) + 1)
+ assert num_layers > 0
+ return num_layers
+
+
+def build_engine(onnx_file, engine_file, max_batch_size,max_seq_len):
+ cmd = f"ixrtexec --onnx {onnx_file} --min_shape input:1x32x5120,length_radio:1 --opt_shape input:8x64x5120,length_radio:8 --max_shape input:{max_batch_size}x{max_seq_len}x5120,length_radio:64 --plugins ixrt_plugin --save_engine {engine_file}"
+ subprocess.run(cmd.split(), check=True)
+
+
+def main(args):
+ graph = Graph()
+ transform = GraphTransform(graph)
+ ckpt_path = glob.glob(os.path.join(args.ckpt_path, "*/model.ckpt"))[0]
+ print("load ckpt from: ", ckpt_path)
+ state_dict = torch.load(ckpt_path)
+
+ # print([i for i in state_dict ])
+ # print(state_dict['3.w.bias'])
+ args.hidden_size = state_dict['1.encoder.layers.0.norm1.norm.weight'].size(
+ 0)
+ args.head_dim = args.hidden_size // args.head_num
+ args.inner_size = state_dict['1.encoder.layers.0.pos_ffn.ffn.0.bias'].size(
+ 0)
+ args.vocab_size = state_dict['3.w.weight'].size(0)
+
+ args.num_layers = get_num_layers(state_dict)
+
+ args.src_len = state_dict["1.custom_src_module.layers.0.w.weight"].size(1)
+
+ # args.num_layers = 1
+ add_make_mask_op(transform, state_dict, args)
+ add_custom_linear_op(transform, state_dict, args)
+ add_pos_encode_op(transform, state_dict, args)
+ add_transformer_op(transform, state_dict, args)
+ add_layer_norm_op(transform, state_dict, args)
+ # add_linear_op(transform, state_dict, args)
+ # add_log_softmax_op(transform, state_dict, args)
+ # add_search_node(transform, state_dict, args)
+
+ # IO attributes
+ length_radio = graph.get_variable('length_radio')
+ length_radio.set_shape(["batch_size"])
+ length_radio.dtype = "float16"
+ graph.add_input(length_radio)
+
+ input = graph.get_variable('input')
+ input.set_shape(["batch_size", "seq_len", "src_len"])
+ input.dtype = "float16"
+ graph.add_input(input)
+
+ output = graph.get_variable('encoder_ln_out')
+ output.dtype = "float16"
+ graph.add_output(output)
+
+ create_target(saved_path=args.onnx_path).export(graph)
+
+ build_engine(args.onnx_path, args.engine_path, args.max_batch_size, args.max_seq_len)
+ print("save engine: ", args.engine_path)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ ckpt_path = args.ckpt_path
+
+ main(args)
+
+"""
+python3 builder.py \
+--ckpt_path results/transformer/8886/save \
+--head_num 4 \
+--max_batch_size 64 \
+--max_seq_len 1024 \
+--engine_path transformer.engine
+"""
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/convert.py b/models/speech/speech_recognition/transformer_asr/ixrt/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d71a566c7d03daff6f063e46fb5984665714cb
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/convert.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+from faster_layer_norm import FasterLayerNorm
+
+def replace_layer_norm(model):
+ module_output = model
+
+ if isinstance(model, torch.nn.modules.normalization.LayerNorm):
+ return FasterLayerNorm(model.weight, model.bias)
+
+ for name, child in model.named_children():
+ module_output.add_module(
+ name, replace_layer_norm(child)
+ )
+ return module_output
+
+
+def convert_decoder_model(model):
+ model = replace_layer_norm(model)
+ # for layer in model.layers:
+ # norm = layer.norm1.norm
+ # print(type(norm))
+ # exit()
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm1.norm = new_norm
+
+ # norm = layer.norm2.norm
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm2.norm = new_norm
+
+ # norm = layer.norm3.norm
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm3.norm = new_norm
+ return model
+
+# def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+# if type(module) in layers:
+# return {name: module}
+# res = {}
+# for name1, child in module.named_children():
+# res.update(find_layers(
+# child, layers=layers, name=name + '.' + name1 if name != '' else name1
+# ))
+# return res
+
+def find_node(module):
+ if type(module) in [torch.nn.LayerNorm]:
+ print(module)
+ return
+ res = {}
+ for name1, child in module.named_children():
+ find_node(child)
+ return
+
+
+def patch_get_lookahead_mask(padded_input):
+ """Creates a binary mask for each sequence which maskes future frames.
+
+ Arguments
+ ---------
+ padded_input: torch.Tensor
+ Padded input tensor.
+
+ Example
+ -------
+ >>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
+ >>> get_lookahead_mask(a)
+ tensor([[0., -inf, -inf],
+ [0., 0., -inf],
+ [0., 0., 0.]])
+ """
+ seq_len = padded_input.shape[1]
+ mask = (
+ torch.triu(torch.ones((seq_len, seq_len), device=padded_input.device))
+ == 1
+ ).transpose(0, 1)
+ mask = (
+ mask.float()
+ .masked_fill(mask == 0, float("-inf"))
+ .masked_fill(mask == 1, float(0.0))
+ )
+ return mask.detach().to(padded_input.device).to(torch.float16)
\ No newline at end of file
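
A minimal sketch (not part of the patch) of how the fp16 look-ahead mask returned by `patch_get_lookahead_mask` is typically broadcast-added to raw attention scores before the softmax. It assumes the `faster_layer_norm` extension is already built, since `convert.py` imports it at module load; all shapes are illustrative:

```python
import torch
from convert import patch_get_lookahead_mask  # this file

batch, seq_len, heads = 2, 3, 4
padded_input = torch.ones(batch, seq_len, 8, device="cuda", dtype=torch.float16)

mask = patch_get_lookahead_mask(padded_input)   # (seq_len, seq_len), fp16, 0 / -inf
scores = torch.randn(batch, heads, seq_len, seq_len, device="cuda", dtype=torch.float16)
probs = torch.softmax(scores + mask, dim=-1)    # future positions receive ~0 probability
```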
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py b/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db6ab7e1b92a58b579af85805ee9c0b98b8f3c0
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py
@@ -0,0 +1,394 @@
+"""Decoders and output normalization for CTC.
+
+Authors
+ * Mirco Ravanelli 2020
+ * Aku Rouhe 2020
+ * Sung-Lin Yeh 2020
+"""
+import torch
+from itertools import groupby
+from speechbrain.dataio.dataio import length_to_mask
+from faster_logsumexp import FasterLogSumExp
+from faster_stack import FasterStack
+from faster_cat import FastCat
+
+
+class CTCPrefixScorer:
+ """This class implements the CTC prefix scorer of Algorithm 2 in
+ reference: https://www.merl.com/publications/docs/TR2017-190.pdf.
+ Official implementation: https://github.com/espnet/espnet/blob/master/espnet/nets/ctc_prefix_score.py
+
+ Arguments
+ ---------
+ x : torch.Tensor
+ The encoder states.
+ enc_lens : torch.Tensor
+ The actual length of each enc_states sequence.
+ batch_size : int
+ The size of the batch.
+ beam_size : int
+ The width of beam.
+ blank_index : int
+ The index of the blank token.
+ eos_index : int
+ The index of the end-of-sequence (eos) token.
+ ctc_window_size: int
+ Compute the ctc scores over the time frames using windowing based on attention peaks.
+ If 0, no windowing applied.
+ """
+
+ def __init__(
+ self,
+ x,
+ enc_lens,
+ batch_size,
+ beam_size,
+ blank_index,
+ eos_index,
+ ctc_window_size=0,
+ ):
+ self.blank_index = blank_index
+ self.eos_index = eos_index
+ self.max_enc_len = x.size(1)
+ self.batch_size = batch_size
+ self.beam_size = beam_size
+ self.vocab_size = x.size(-1)
+ self.device = x.device
+ self.minus_inf = -1e4
+ self.last_frame_index = enc_lens - 1
+ self.ctc_window_size = ctc_window_size
+
+ # mask frames > enc_lens
+ mask = 1 - length_to_mask(enc_lens)
+ mask = mask.unsqueeze(-1).expand(-1, -1, x.size(-1)).eq(1)
+ x.masked_fill_(mask, self.minus_inf)
+ x[:, :, 0] = x[:, :, 0].masked_fill_(mask[:, :, 0], 0)
+
+ # dim=0: xnb, nonblank posteriors, dim=1: xb, blank posteriors
+ xnb = x.transpose(0, 1)
+ xb = (
+ xnb[:, :, self.blank_index]
+ .unsqueeze(2)
+ .expand(-1, -1, self.vocab_size)
+ )
+
+ # (2, L, batch_size * beam_size, vocab_size)
+ # self.x = torch.stack([xnb, xb])
+ self.x = FasterStack([xnb.contiguous(), xb.contiguous()])
+
+ # The first index of each sentence.
+ self.beam_offset = (
+ torch.arange(batch_size, device=self.device) * self.beam_size
+ )
+ # The first index of each candidates.
+ self.cand_offset = (
+ torch.arange(batch_size, device=self.device) * self.vocab_size
+ )
+
+ def forward_step(self, g, state, candidates=None, attn=None):
+ """This method if one step of forwarding operation
+ for the prefix ctc scorer.
+
+ Arguments
+ ---------
+ g : torch.Tensor
+ The tensor of prefix label sequences, h = g + c.
+ state : tuple
+ Previous ctc states.
+ candidates : torch.Tensor
+ (batch_size * beam_size, ctc_beam_size), The topk candidates for rescoring.
+ The ctc_beam_size is set as 2 * beam_size. If given, performing partial ctc scoring.
+ """
+
+ prefix_length = g.size(1)
+ last_char = [gi[-1] for gi in g] if prefix_length > 0 else [0] * len(g)
+ self.num_candidates = (
+ self.vocab_size if candidates is None else candidates.size(-1)
+ )
+ if state is None:
+ # r_prev: (L, 2, batch_size * beam_size)
+ r_prev = torch.full(
+ (self.max_enc_len, 2, self.batch_size, self.beam_size),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+
+ # Accumulate blank posteriors at each step
+ r_prev[:, 1] = torch.cumsum(
+ self.x[0, :, :, self.blank_index], 0
+ ).unsqueeze(2)
+ r_prev = r_prev.view(-1, 2, self.batch_size * self.beam_size)
+ psi_prev = 0.0
+ else:
+ r_prev, psi_prev = state
+ r_prev = r_prev.half()
+
+ # for partial search
+ if candidates is not None:
+ scoring_table = torch.full(
+ (self.batch_size * self.beam_size, self.vocab_size),
+ -1,
+ dtype=torch.long,
+ device=self.device,
+ )
+ # Assign indices of candidates to their positions in the table
+ col_index = torch.arange(
+ self.batch_size * self.beam_size, device=self.device
+ ).unsqueeze(1)
+ scoring_table[col_index, candidates] = torch.arange(
+ self.num_candidates, device=self.device
+ )
+ # Select candidates indices for scoring
+ scoring_index = (
+ candidates
+ + self.cand_offset.unsqueeze(1)
+ .repeat(1, self.beam_size)
+ .view(-1, 1)
+ ).view(-1)
+ x_inflate = torch.index_select(
+ self.x.view(2, -1, self.batch_size * self.vocab_size),
+ 2,
+ scoring_index,
+ ).view(2, -1, self.batch_size * self.beam_size, self.num_candidates)
+ # for full search
+ else:
+ scoring_table = None
+ x_inflate = (
+ self.x.unsqueeze(3)
+ .repeat(1, 1, 1, self.beam_size, 1)
+ .view(
+ 2, -1, self.batch_size * self.beam_size, self.num_candidates
+ )
+ )
+
+ # Prepare forward probs
+ r = torch.full(
+ (
+ self.max_enc_len,
+ 2,
+ self.batch_size * self.beam_size,
+ self.num_candidates,
+ ),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+ r.fill_(self.minus_inf)
+
+ # (Alg.2-6)
+ if prefix_length == 0:
+ r[0, 0] = x_inflate[0, 0]
+ # (Alg.2-10): phi = prev_nonblank + prev_blank = r_t-1^nb(g) + r_t-1^b(g)
+ r_sum = FasterLogSumExp(r_prev, 1)
+ phi = r_sum.unsqueeze(2).repeat(1, 1, self.num_candidates)
+
+ # (Alg.2-10): if last token of prefix g in candidates, phi = prev_b + 0
+ if candidates is not None:
+ for i in range(self.batch_size * self.beam_size):
+ pos = scoring_table[i, last_char[i]]
+ if pos != -1:
+ phi[:, i, pos] = r_prev[:, 1, i]
+ else:
+ for i in range(self.batch_size * self.beam_size):
+ phi[:, i, last_char[i]] = r_prev[:, 1, i]
+
+ # Start, end frames for scoring (|g| < |h|).
+ # Scoring based on attn peak if ctc_window_size > 0
+ if self.ctc_window_size == 0 or attn is None:
+ start = max(1, prefix_length)
+ end = self.max_enc_len
+ else:
+ _, attn_peak = torch.max(attn, dim=1)
+ max_frame = torch.max(attn_peak).item() + self.ctc_window_size
+ min_frame = torch.min(attn_peak).item() - self.ctc_window_size
+ start = max(max(1, prefix_length), int(min_frame))
+ end = min(self.max_enc_len, int(max_frame))
+
+ # Compute forward prob log(r_t^nb(h)) and log(r_t^b(h)):
+ for t in range(start, end):
+ # (Alg.2-11): dim=0, p(h|cur step is nonblank) = [p(prev step=y) + phi] * p(c)
+ rnb_prev = r[t - 1, 0]
+ # (Alg.2-12): dim=1, p(h|cur step is blank) = [p(prev step is blank) + p(prev step is nonblank)] * p(blank)
+ rb_prev = r[t - 1, 1]
+ # r_ = torch.stack([rnb_prev, phi[t - 1], rnb_prev, rb_prev]).view(
+ # 2, 2, self.batch_size * self.beam_size, self.num_candidates
+ # )
+ r_ = FasterStack([rnb_prev, phi[t - 1], rnb_prev, rb_prev]).view(
+ 2, 2, self.batch_size * self.beam_size, self.num_candidates
+ )
+ r[t] = FasterLogSumExp(r_, 1) + x_inflate[:, t]
+
+ # Compute the prefix prob, psi
+ psi_init = r[start - 1, 0].unsqueeze(0)
+ # phi is prob at t-1 step, shift one frame and add it to the current prob p(c)
+ phix = FastCat((phi[0].unsqueeze(0), phi[:-1]), dim=0) + x_inflate[0]
+
+ # (Alg.2-13): psi = psi + phi * p(c)
+ if candidates is not None:
+ psi = torch.full(
+ (self.batch_size * self.beam_size, self.vocab_size),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+ psi_ = FasterLogSumExp(
+ FastCat((phix[start:end], psi_init), dim=0), dim=0
+ )
+ # only assign prob to candidates
+ for i in range(self.batch_size * self.beam_size):
+ psi[i, candidates[i]] = psi_[i]
+ else:
+ psi = FastCat((phix[start:end], psi_init), dim=0)
+ psi = FasterLogSumExp(psi, dim=0)
+
+ # (Alg.2-3): if c = <eos>, psi = log(r_T^n(g) + r_T^b(g)), where T is the length of max frames
+ for i in range(self.batch_size * self.beam_size):
+ psi[i, self.eos_index] = r_sum[
+ self.last_frame_index[i // self.beam_size], i
+ ]
+
+ # Exclude blank probs for joint scoring
+ psi[:, self.blank_index] = self.minus_inf
+
+ return psi - psi_prev, (r, psi, scoring_table)
+
+ def permute_mem(self, memory, index):
+ """This method permutes the CTC model memory
+ to synchronize the memory index with the current output.
+
+ Arguments
+ ---------
+ memory : No limit
+ The memory variable to be permuted.
+ index : torch.Tensor
+ The index of the previous path.
+
+ Return
+ ------
+ The variable of the memory being permuted.
+
+ """
+ r, psi, scoring_table = memory
+ # The index of top-K vocab came from in (t-1) timesteps.
+ best_index = (
+ index
+ + (self.beam_offset.unsqueeze(1).expand_as(index) * self.vocab_size)
+ ).view(-1)
+ # synchronize forward prob
+ psi = torch.index_select(psi.view(-1), dim=0, index=best_index)
+ psi = (
+ psi.view(-1, 1)
+ .repeat(1, self.vocab_size)
+ .view(self.batch_size * self.beam_size, self.vocab_size)
+ )
+
+ # synchronize ctc states
+ if scoring_table is not None:
+ effective_index = (
+ index // self.vocab_size + self.beam_offset.view(-1, 1)
+ ).view(-1)
+ selected_vocab = (index % self.vocab_size).view(-1)
+ score_index = scoring_table[effective_index, selected_vocab]
+ score_index[score_index == -1] = 0
+ best_index = score_index + effective_index * self.num_candidates
+
+ r = torch.index_select(
+ r.view(
+ -1, 2, self.batch_size * self.beam_size * self.num_candidates
+ ),
+ dim=-1,
+ index=best_index,
+ )
+ r = r.view(-1, 2, self.batch_size * self.beam_size)
+
+ return r, psi
+
+
+def filter_ctc_output(string_pred, blank_id=-1):
+ """Apply CTC output merge and filter rules.
+
+ Removes the blank symbol and output repetitions.
+
+ Arguments
+ ---------
+ string_pred : list
+ A list containing the output strings/ints predicted by the CTC system.
+ blank_id : int, string
+ The id of the blank.
+
+ Returns
+ -------
+ list
+ The output predicted by CTC without the blank symbol and
+ the repetitions.
+
+ Example
+ -------
+ >>> string_pred = ['a','a','blank','b','b','blank','c']
+ >>> string_out = filter_ctc_output(string_pred, blank_id='blank')
+ >>> print(string_out)
+ ['a', 'b', 'c']
+ """
+
+ if isinstance(string_pred, list):
+ # Filter the repetitions
+ string_out = [
+ v
+ for i, v in enumerate(string_pred)
+ if i == 0 or v != string_pred[i - 1]
+ ]
+
+ # Remove duplicates
+ string_out = [i[0] for i in groupby(string_out)]
+
+ # Filter the blank symbol
+ string_out = list(filter(lambda elem: elem != blank_id, string_out))
+ else:
+ raise ValueError("filter_ctc_out can only filter python lists")
+ return string_out
+
+
+def ctc_greedy_decode(probabilities, seq_lens, blank_id=-1):
+ """Greedy decode a batch of probabilities and apply CTC rules.
+
+ Arguments
+ ---------
+ probabilities : torch.tensor
+ Output probabilities (or log-probabilities) from the network with shape
+ [batch, probabilities, time]
+ seq_lens : torch.tensor
+ Relative true sequence lengths (to deal with padded inputs),
+ the longest sequence has length 1.0, others a value between zero and one
+ shape [batch, lengths].
+ blank_id : int, string
+ The blank symbol/index. Default: -1. If a negative number is given,
+ it is assumed to mean counting down from the maximum possible index,
+ so that -1 refers to the maximum possible index.
+
+ Returns
+ -------
+ list
+ Outputs as Python list of lists, with "ragged" dimensions; padding
+ has been removed.
+
+ Example
+ -------
+ >>> import torch
+ >>> probs = torch.tensor([[[0.3, 0.7], [0.0, 0.0]],
+ ... [[0.2, 0.8], [0.9, 0.1]]])
+ >>> lens = torch.tensor([0.51, 1.0])
+ >>> blank_id = 0
+ >>> ctc_greedy_decode(probs, lens, blank_id)
+ [[1], [1]]
+ """
+ if isinstance(blank_id, int) and blank_id < 0:
+ blank_id = probabilities.shape[-1] + blank_id
+ batch_max_len = probabilities.shape[1]
+ batch_outputs = []
+ for seq, seq_len in zip(probabilities, seq_lens):
+ actual_size = int(torch.round(seq_len * batch_max_len))
+ scores, predictions = torch.max(seq.narrow(0, 0, actual_size), dim=1)
+ out = filter_ctc_output(predictions.tolist(), blank_id=blank_id)
+ batch_outputs.append(out)
+ return batch_outputs
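
A rough, untested sketch (not part of the patch) of driving `CTCPrefixScorer` for one full-vocabulary scoring step. It assumes the `faster_*` extensions in this directory have been built and that SpeechBrain is installed; all dimensions are illustrative:

```python
import torch
from ctc import CTCPrefixScorer  # this file

batch_size, beam_size, T, vocab_size = 2, 3, 50, 30
blank_index, eos_index = 0, vocab_size - 1

# Per-utterance CTC log-probabilities from the encoder and the true encoder lengths.
ctc_logprobs = torch.randn(batch_size, T, vocab_size, device="cuda").half()
enc_lens = torch.full((batch_size,), T, device="cuda")

scorer = CTCPrefixScorer(ctc_logprobs, enc_lens, batch_size, beam_size,
                         blank_index, eos_index)

# Empty prefixes for every beam; the first call passes state=None.
g = torch.zeros(batch_size * beam_size, 0, dtype=torch.long, device="cuda")
psi, state = scorer.forward_step(g, None)
print(psi.shape)  # (batch_size * beam_size, vocab_size)
```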
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..537d35c57ca840042a6e694f2ab29c333246d625
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py
@@ -0,0 +1,13 @@
+import torch
+from faster_cat import sp_opt
+
+def FastCat(inputs,dim=0):
+ if len(inputs) == 2 and dim==0:
+ a,b = inputs
+ in_shape = a.shape
+ if len(in_shape)>1:
+ res, = sp_opt.test_opt_2(a.view(a.shape[0],-1),b.view(b.shape[0],-1))
+ new_shape = (a.shape[0]+b.shape[0],) + in_shape[1:]
+ res = res.view(*new_shape)
+ return res
+ return torch.cat(inputs,dim=dim)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..022fac397611cccca7f35c6cad0406969b123bf2
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu
@@ -0,0 +1,79 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
+__global__ void Cat(half* a, half* b, half* output, int m1, int m2, int k) {
+ int i = blockIdx.y * blockDim.x + threadIdx.x;
+ // a
+ if (blockIdx.x < m1) {
+ half2* h2_a = reinterpret_cast<half2*>(a + blockIdx.x * k);
+ half2* h2_out_a = reinterpret_cast<half2*>(output + blockIdx.x * k);
+ if (i < k / 2) {
+ h2_out_a[i] = h2_a[i];
+ }
+ }
+ // b
+ if (blockIdx.x < m2) {
+ half2* h2_b = reinterpret_cast<half2*>(b + blockIdx.x * k);
+ half2* h2_out_b =
+ reinterpret_cast<half2*>(output + blockIdx.x * k + m1 * k);
+ if (i < k / 2) {
+ h2_out_b[i] = h2_b[i];
+ }
+ }
+}
+
+void IxinferCatLauncher(half* a, half* b, half* output, int m1, int m2, int k,
+ cudaStream_t stream) {
+ if (k % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int m = std::max(m1, m2);
+ int num_threads = 1024;
+ int half_k = k / 2;
+ int num_roll = (half_k - 1 + num_threads) / num_threads;
+ dim3 grid(m, num_roll);
+ dim3 block(num_threads);
+ Cat<<<grid, block, 0, stream>>>(a, b, output, m1, m2, k);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 2);
+ TORCH_CHECK(b.dim() == 2);
+
+ int m1 = a.size(0);
+ int m2 = b.size(0);
+
+ int k = a.size(1);
+
+ TORCH_CHECK(b.size(1) == k);
+
+ at::Tensor output = a.new_empty({(m1 + m2), k});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferCatLauncher(p_a, p_b, p_out, m1, m2, k,
+ stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1172081109fd7c970de661427851854f13313b21
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp
@@ -0,0 +1,21 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b);
+
+std::vector<at::Tensor> test_opt_2(at::Tensor a, at::Tensor b) {
+ return one_test_opt_2(a, b);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt_2", &test_opt_2, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2713dae297bf44340caa556ba9f0dd3860219326
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ m1 = 320
+ m2 = 321
+ hidden_size = 5000
+
+ a = torch.randn([m1,hidden_size]).cuda().half()
+ b = torch.randn([m2,hidden_size]).cuda().half()
+
+
+ res_pt = torch.cat([a,b],dim=0)
+
+ res_cu, = sp_opt.test_opt_2(a,b)
+
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_2(a,b)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..20603650006a6e4d586957a9c38193a3c12937a9
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py
@@ -0,0 +1,16 @@
+import torch
+from faster_layer_norm import sp_opt
+
+class FasterLayerNorm(torch.nn.Module):
+ def __init__(self, weight, bias):
+ super(FasterLayerNorm, self).__init__()
+ self.weight = weight
+ self.bias = bias
+
+ def forward(self, inputs, *args, **kwargs):
+ hidden_size = self.weight.size(0)
+ in_shape = inputs.shape
+ inputs = inputs.view(-1,hidden_size)
+ output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+ output = output.view(*in_shape)
+ return output
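
A small parity check (not part of the patch) comparing `FasterLayerNorm` against `torch.nn.functional.layer_norm`. It assumes the extension has been built with `build.sh` so that `sp_opt` is importable from this directory; the sizes are arbitrary:

```python
import torch
from faster_layer_norm import FasterLayerNorm

hidden_size = 512
x = torch.randn(8, 100, hidden_size, device="cuda").half()
weight = torch.randn(hidden_size, device="cuda").half()
bias = torch.randn(hidden_size, device="cuda").half()

fast_ln = FasterLayerNorm(weight, bias)
out_fast = fast_ln(x)
out_ref = torch.nn.functional.layer_norm(x, (hidden_size,), weight, bias, eps=1e-5)

print((out_fast - out_ref).abs().max())  # expected to be a small fp16 rounding difference
```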
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..852db917b04c3c5c93d31e677e0d74aeb45f6edc
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu
@@ -0,0 +1,168 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "transformer_helper.cuh"
+
+namespace iluvatar::inferrt::transformer {
+
+template <int UNROLL_FACTOR>
+__global__ void LnOpt2Kernel(half* input, half* ln_weight, half* ln_bias,
+ half* output, int hidden_size,
+ float layernorm_eps) {
+ input += blockIdx.x * hidden_size;
+ output += blockIdx.x * hidden_size;
+
+ half2* p_in = reinterpret_cast<half2*>(input);
+ half2* p_out = reinterpret_cast<half2*>(output);
+ half2* p_wei = reinterpret_cast<half2*>(ln_weight);
+ half2* p_bias = reinterpret_cast<half2*>(ln_bias);
+ int half_hidden_size = hidden_size / 2;
+
+ extern __shared__ half2 shmem[];
+
+ float s_mean;
+ float s_variance;
+ float x_sum = 0.0f;
+ float x2_sum = 0.0f;
+#pragma unroll UNROLL_FACTOR
+ for (int i = 0; i < UNROLL_FACTOR; ++i) {
+ int index = i * blockDim.x + threadIdx.x;
+ if (index < half_hidden_size) {
+ half2 value = p_in[index];
+ shmem[index] = value;
+ float val_1 = __half2float(value.x);
+ float val_2 = __half2float(value.y);
+ x_sum += val_1 + val_2;
+ x2_sum += val_1 * val_1 + val_2 * val_2;
+ }
+ }
+ float sums[2]; // sum, sum of squares
+ sums[0] = x_sum;
+ sums[1] = x2_sum;
+ blockReduceSumV2<float, 2>(sums);
+
+ s_mean = sums[0] / hidden_size;
+ s_variance = rsqrtf(sums[1] / hidden_size - s_mean * s_mean + layernorm_eps);
+
+#pragma unroll UNROLL_FACTOR
+ for (int i = 0; i < UNROLL_FACTOR; ++i) {
+ int index = i * blockDim.x + threadIdx.x;
+ if (index < half_hidden_size) {
+ half2 wei_value = p_wei[index];
+ half2 bias_value = p_bias[index];
+ half2 vals_value = shmem[index];
+
+ float2 norm_value;
+ norm_value.x = (__half2float(vals_value.x) - s_mean) * s_variance *
+ __half2float(wei_value.x) +
+ __half2float(bias_value.x);
+ norm_value.y = (__half2float(vals_value.y) - s_mean) * s_variance *
+ __half2float(wei_value.y) +
+ __half2float(bias_value.y);
+
+ __half2 res;
+ res.x = __float2half(norm_value.x);
+ res.y = __float2half(norm_value.y);
+
+ p_out[index] = res;
+ }
+ }
+}
+
+// FasterTransformer/src/fastertransformer/kernels/layernorm_kernels.cu
+void IxinferLnLauncherOpt2(__half* input, __half* ln_weight, __half* ln_bias,
+ __half* output, int batch_tokens, int hidden_size,
+ cudaStream_t stream) {
+ const float layernorm_eps = 1e-5;
+ if (hidden_size % 2 != 0) {
+ throw std::runtime_error("layer norm error: hidden_size % 2 != 0");
+ }
+ dim3 grid(batch_tokens);
+ int half_n = hidden_size / 2;
+ int half_n_warp = (half_n + warpSize - 1) / warpSize * warpSize;
+ dim3 block(std::min(half_n_warp, 1024));
+ int rolls_per_thread = (half_n + block.x - 1) / block.x;
+ // Dynamic shared memory for the extern __shared__ half2 buffer: hidden_size halves in total.
+ size_t shmem_bytes = hidden_size * sizeof(half);
+ switch (rolls_per_thread) {
+ case 1:
+ LnOpt2Kernel<1><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 2:
+ LnOpt2Kernel<2><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 3:
+ LnOpt2Kernel<3><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 4:
+ LnOpt2Kernel<4><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 5:
+ LnOpt2Kernel<5><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 6:
+ LnOpt2Kernel<6><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 7:
+ LnOpt2Kernel<7><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 8:
+ LnOpt2Kernel<8><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ default:
+ std::cout << "hidden_size: " << hidden_size << std::endl;
+ throw std::runtime_error("layer norm error, unsupported hidden size!");
+ break;
+ }
+}
+} // namespace iluvatar::inferrt::transformer
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(ln_weight.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(ln_weight.is_cuda());
+ TORCH_CHECK(ln_weight.is_contiguous());
+
+ TORCH_CHECK(ln_bias.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(ln_bias.is_cuda());
+ TORCH_CHECK(ln_bias.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 2);
+ TORCH_CHECK(ln_weight.dim() == 1);
+ TORCH_CHECK(ln_bias.dim() == 1);
+
+ int batch_tokens = input.size(0);
+ int hidden_size = input.size(1);
+
+ TORCH_CHECK(ln_weight.size(0) == hidden_size);
+ TORCH_CHECK(ln_bias.size(0) == hidden_size);
+
+ at::Tensor output = at::empty_like(input);
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_wei = (half*)ln_weight.data_ptr();
+ half* p_bias = (half*)ln_bias.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLnLauncherOpt2(
+ p_in, p_wei, p_bias, p_out, batch_tokens, hidden_size, stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f925c1b468189dbea8e5d8bfaaef623b989f3163
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp
@@ -0,0 +1,22 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias);
+
+std::vector<at::Tensor> test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias) {
+ return one_test_opt(input, ln_weight, ln_bias);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "fast depthwise conv1d forward");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f8a57622c7d84e5549f99d419cda6bb5011a6ffa
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh
@@ -0,0 +1,295 @@
+#pragma once
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+namespace iluvatar {
+namespace inferrt {
+namespace transformer {
+
+__forceinline__ int nearest_4(int x) {
+ if (x % 4 == 0) {
+ return x;
+ } else {
+ int padding = 4 - x % 4;
+ return x + padding;
+ }
+}
+
+__forceinline__ int nearest_2(int x) {
+ if (x % 2 == 0) {
+ return x;
+ } else {
+ int padding = 2 - x % 2;
+ return x + padding;
+ }
+}
+
+__forceinline__ int nearest_num(int x, int value) {
+ if (x % value == 0) {
+ return x;
+ } else {
+ int padding = value - x % value;
+ return x + padding;
+ }
+}
+
+__device__ int8_t float2int8(float x, float quant_scale) {
+ float i8_f = x * quant_scale;
+ int32_t i8 = floorf(i8_f + 0.5);
+ i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
+ return int8_t(i8);
+}
+
+__device__ void WelfordCombine(float val, float *mean, float *m2,
+ float *count) {
+ // Use Welford's online algorithm to compute mean and variance
+ // For more details you can refer to:
+ // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+ *count += 1;
+ float delta1 = val - *mean;
+ *mean += delta1 / *count;
+ float delta2 = val - *mean;
+ *m2 += delta1 * delta2;
+}
+
+__device__ void WelfordCombine(float b_mean, float b_m2, float b_count,
+ float *mean, float *m2, float *count) {
+ if (b_count == 0) {
+ return;
+ }
+ float new_count = *count + b_count;
+ float nb_over_n = b_count / new_count;
+ float delta = b_mean - *mean;
+ *mean += delta * nb_over_n;
+ *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
+ *count = new_count;
+}
+
+__device__ void WelfordWarpReduce(float thread_mean, float thread_m2,
+ float thread_count, float *mean, float *m2,
+ float *count) {
+ *mean = thread_mean;
+ *m2 = thread_m2;
+ *count = thread_count;
+ for (int mask = warpSize / 2; mask > 0; mask /= 2) {
+ float b_mean = __shfl_down_sync(0xffffffff, *mean, mask);
+ float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask);
+ float b_count = __shfl_down_sync(0xffffffff, *count, mask);
+ WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
+ }
+}
+
+// load two half2 values and store them into a float4
+__device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) {
+ __half2 i1 = input[index * 2];
+ __half2 i2 = input[index * 2 + 1];
+
+ vals.x = __half2float(i1.x);
+ vals.y = __half2float(i1.y);
+ vals.z = __half2float(i2.x);
+ vals.w = __half2float(i2.y);
+}
+
+__device__ char4 float42char4(float4 vals, float quant_scale) {
+ char4 res;
+ res.x = float2int8(vals.x, quant_scale);
+ res.y = float2int8(vals.y, quant_scale);
+ res.z = float2int8(vals.z, quant_scale);
+ res.w = float2int8(vals.w, quant_scale);
+ return res;
+}
+
+__device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1,
+ half2 residual_2, float dequant_scale) {
+ float4 res;
+ res.x =
+ __int2float_rn(input_4.x) * dequant_scale + __half2float(residual_1.x);
+ res.y =
+ __int2float_rn(input_4.y) * dequant_scale + __half2float(residual_1.y);
+ res.z =
+ __int2float_rn(input_4.z) * dequant_scale + __half2float(residual_2.x);
+ res.w =
+ __int2float_rn(input_4.w) * dequant_scale + __half2float(residual_2.y);
+ return res;
+}
+
+__device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2,
+ int hidden_size, float epsilon,
+ half2 scale_1, half2 scale_2,
+ half2 bias_1, half2 bias_2) {
+ float4 norm_value;
+ norm_value.x = (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_1.x) +
+ __half2float(bias_1.x);
+ norm_value.y = (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_1.y) +
+ __half2float(bias_1.y);
+ norm_value.z = (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_2.x) +
+ __half2float(bias_2.x);
+ norm_value.w = (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_2.y) +
+ __half2float(bias_2.y);
+ return norm_value;
+}
+
+// softmax
+__forceinline__ __host__ __device__ int log2_ceil(int value) {
+ int log2_value = 0;
+ while ((1 << log2_value) < value) ++log2_value;
+ return log2_value;
+}
+template <typename T>
+__device__ T WARP_SHFL_XOR(T value, int laneMask, int width) {
+ unsigned int mask = 0xffffffff;
+#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__))
+ return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+ return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename T>
+struct Add {
+ __device__ T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct Max {
+ __device__ T operator()(T a, T b) const { return a < b ? b : a; }
+};
+template <typename acc_t, int REDUCE_WARP_SIZE, template <typename> class ReduceOp>
+__device__ void warp_reduce(acc_t *sum) {
+ ReduceOp<acc_t> r;
+#pragma unroll
+ for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) {
+ acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE);
+ *sum = r(*sum, b);
+ }
+}
+
+__device__ void warp_argmax(float &value, int32_t &idx) {
+ for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+ float next_value = WARP_SHFL_XOR(value, offset, warpSize);
+ int32_t next_idx = WARP_SHFL_XOR(idx, offset, warpSize);
+ if (next_value > value) {
+ value = next_value;
+ idx = next_idx;
+ }
+ }
+}
+
+// gelu
+// IxinferBiasGeluI8II8OKernel
+template <typename T>
+__device__ T tanhf_exp(T x) {
+ // float e1 = __expf(x);
+ // float e2 = 1.0f / e1;
+ // return (e1 - e2) / (e1 + e2);
+
+ return (2.f / (1.f + __expf(-2.f * x)) - 1.f);
+}
+
+template <typename T>
+__device__ T gelu(T x) {
+ float cdf =
+ 0.5f *
+ (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x))));
+ return x * cdf;
+}
+
+/* fp16 gelu */
+template <>
+__forceinline__ __device__ __half2 gelu<__half2>(__half2 val) {
+ __half2 val_pow3 = __hmul2(val, __hmul2(val, val));
+ float2 tmp_pow = __half22float2(val_pow3);
+ float2 tmp = __half22float2(val);
+
+ tmp.x =
+ 0.5f *
+ (1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
+ tmp.y =
+ 0.5f *
+ (1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
+ return __hmul2(val, __float22half2_rn(tmp));
+}
+
+/* Convert vector index to 3-dim tensor index */
+__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
+ int dim2, int *id0,
+ int *id1, int *id2) {
+ *id2 = src % dim2;
+ src /= dim2;
+
+ *id1 = src % dim1;
+ *id0 = src / dim1;
+}
+
+template <typename T, int NUM>
+__inline__ __device__ T warpReduceSumV2(T *val) {
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+#pragma unroll
+ for (int mask = warpSize / 2; mask > 0; mask >>= 1)
+ val[i] += __shfl_xor_sync(0xffffffff, val[i], mask, warpSize);
+ }
+ return (T)(0.0f);
+}
+
+template <typename T, int NUM>
+__inline__ __device__ T blockReduceSumV2(T *val) {
+ static __shared__ T shared[NUM][warpSize + 1];
+ int lane = threadIdx.x % warpSize;
+ int wid = threadIdx.x / warpSize;
+
+ warpReduceSumV2<T, NUM>(val);
+
+ if (lane == 0) {
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+ shared[i][wid] = val[i];
+ }
+ }
+
+ __syncthreads();
+
+ bool is_mask = lane < (blockDim.x / warpSize);
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+ val[i] = is_mask ? shared[i][lane] : (T)(0.0f);
+ }
+ warpReduceSumV2<T, NUM>(val);
+ return (T)0.0f;
+}
+
+__inline__ __device__ void warpReduceSum2Number(float *x, float *y) {
+#pragma unroll
+ for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
+ *x += __shfl_xor_sync(0xffffffff, *x, mask, warpSize);
+ *y += __shfl_xor_sync(0xffffffff, *y, mask, warpSize);
+ }
+}
+
+__inline__ __device__ void blockReduceSum2Number(float *x, float *y) {
+ static __shared__ float shared[2][warpSize + 1];
+ int lane = threadIdx.x % warpSize;
+ int wid = threadIdx.x / warpSize;
+
+ warpReduceSum2Number(x, y);
+ if (lane == 0) {
+ shared[0][wid] = *x;
+ shared[1][wid] = *y;
+ }
+ __syncthreads();
+ bool is_mask = lane < (blockDim.x / warpSize);
+ *x = is_mask ? shared[0][lane] : 0.0f;
+ *y = is_mask ? shared[1][lane] : 0.0f;
+
+ warpReduceSum2Number(x, y);
+}
+
+} // namespace transformer
+
+} // namespace inferrt
+} // namespace iluvatar
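
The `WelfordCombine` helpers above implement Welford's online mean/variance update. A short Python illustration (not part of the patch) of the same single-value update rule, checked against a direct computation:

```python
import numpy as np

def welford_update(mean, m2, count, val):
    # Mirrors WelfordCombine(val, &mean, &m2, &count) above.
    count += 1
    delta1 = val - mean
    mean += delta1 / count
    delta2 = val - mean
    m2 += delta1 * delta2
    return mean, m2, count

xs = np.random.randn(1000)
mean, m2, count = 0.0, 0.0, 0.0
for v in xs:
    mean, m2, count = welford_update(mean, m2, count, v)

assert np.isclose(mean, xs.mean())
assert np.isclose(m2 / count, xs.var())  # population variance
```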
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d50b3758c439e1c8c73f3d1ad07d104526be71ab
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py
@@ -0,0 +1,38 @@
+import torch
+from faster_logsumexp import sp_opt
+
+# class FasterLogSumExp(torch.nn.Module):
+# def __init__(self, weight, bias):
+# super(FasterLogSumExp, self).__init__()
+# self.weight = weight
+# self.bias = bias
+
+# def forward(self, inputs, *args, **kwargs):
+# hidden_size = self.weight.size(0)
+# in_shape = inputs.shape
+# inputs = inputs.view(-1,hidden_size)
+# output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+# output = output.view(*in_shape)
+# return output
+
+def FasterLogSumExp(inputs,dim):
+ # print(inputs.shape, dim)
+ if dim == 1 and len(inputs.shape)>2 and inputs.size(1)==2:
+ in_shape = inputs.shape
+ inputs = inputs.view(in_shape[0],in_shape[1],-1)
+ res, = sp_opt.test_opt(inputs)
+ new_shape = (in_shape[0],) + in_shape[2:]
+ res = res.view(*new_shape)
+ return res
+ # dim==0: the current kernel implementation may still have a bug?
+ # if dim == 0 and len(inputs.shape)>=2:
+ # in_shape = inputs.shape
+ # inputs = inputs.view(in_shape[0],-1)
+ # res, = sp_opt.test_opt_dim0(inputs)
+ # new_shape = in_shape[1:]
+ # res = res.view(*new_shape)
+ # return res
+ # print(f"not support shape: {inputs.shape} dim: {dim}")
+ res = torch.logsumexp(inputs, dim=dim)
+ return res
+
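
`FasterLogSumExp` dispatches to the custom kernel only for a 3-D fp16 tensor reduced over `dim=1` when that dimension has size 2, and otherwise falls back to `torch.logsumexp`. A quick sketch (not part of the patch), assuming the extension has been built with `build.sh`:

```python
import torch
from faster_logsumexp import FasterLogSumExp

x = torch.randn(16, 2, 4096, device="cuda").half()

out_fast = FasterLogSumExp(x, 1)                 # custom kernel path
out_ref = torch.logsumexp(x.float(), dim=1).half()
print(out_fast.shape)                            # torch.Size([16, 4096])
print((out_fast - out_ref).abs().max())          # small fp16 difference expected

y = torch.randn(4, 3, 5, device="cuda").half()
print(FasterLogSumExp(y, 0).shape)               # falls back to torch.logsumexp -> (3, 5)
```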
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..56eb0810bcb46f121761cd43cba931d285b0635c
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu
@@ -0,0 +1,155 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <cmath>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
+__global__ void LogSumExpWith2(half* input, half* output, int H) {
+ half2* h2_in1 = reinterpret_cast<half2*>(input + blockIdx.x * 2 * H);
+ half2* h2_in2 = reinterpret_cast<half2*>(input + blockIdx.x * 2 * H + H);
+ half2* h2_out = reinterpret_cast<half2*>(output + blockIdx.x * H);
+
+ int i = blockIdx.y * blockDim.x + threadIdx.x;
+ if (i < H / 2) {
+ float2 res;
+ half2 value1 = h2_in1[i];
+ half2 value2 = h2_in2[i];
+
+ res.x = std::log(__expf(__half2float(value1.x)) +
+ __expf(__half2float(value2.x)));
+ res.y = std::log(__expf(__half2float(value1.y)) +
+ __expf(__half2float(value2.y)));
+
+ half2 res_h2;
+ res_h2.x = __float2half(res.x);
+ res_h2.y = __float2half(res.y);
+ h2_out[i] = res_h2;
+ }
+}
+
+void IxinferLogSumExpLauncher(half* input, half* output, int N, int C, int H,
+ cudaStream_t stream) {
+ const float layernorm_eps = 1e-5;
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferLogSumExpLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(N, num_roll);
+ dim3 block(num_threads);
+ switch (C) {
+ case 2:
+      LogSumExpWith2<<<grid, block, 0, stream>>>(input, output, H);
+ break;
+ default:
+ throw std::runtime_error(
+          "IxinferLogSumExpLauncher error, unsupported size!");
+ break;
+ }
+}
+
+// https://zhuanlan.zhihu.com/p/153535799
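+// Reduces an [N, H] fp16 tensor over dim 0 with the usual stabilization
+//   logsumexp(x) = max(x) + log(sum(exp(x - max(x))))
+// so exp() cannot overflow; each thread produces two adjacent outputs via half2.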
+__global__ void LogSumExpDim0(half* input, half* output, int N, int H) {
+  half2* h2_out = reinterpret_cast<half2*>(output);
+
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  // Guard the tail block: H / 2 may not be a multiple of blockDim.x.
+  if (i >= H / 2) return;
+
+ float2 res;
+ res.x = 0.f;
+ res.y = 0.f;
+
+ float2 max_values;
+ max_values.x = -1000.f;
+ max_values.y = -1000.f;
+
+ for (int batch_idx = 0; batch_idx < N; batch_idx++) {
+    half2* h2_in = reinterpret_cast<half2*>(input + batch_idx * H);
+ half2 value = h2_in[i];
+
+ if (max_values.x < __half2float(value.x)) {
+ max_values.x = __half2float(value.x);
+ }
+ if (max_values.y < __half2float(value.y)) {
+ max_values.y = __half2float(value.y);
+ }
+ }
+
+ for (int batch_idx = 0; batch_idx < N; batch_idx++) {
+    half2* h2_in = reinterpret_cast<half2*>(input + batch_idx * H);
+ half2 value = h2_in[i];
+
+ res.x += __expf(__half2float(value.x) - max_values.x);
+ res.y += __expf(__half2float(value.y) - max_values.y);
+ }
+
+ half2 res_h2;
+ res_h2.x = __float2half(std::log(res.x) + max_values.x);
+ res_h2.y = __float2half(std::log(res.y) + max_values.y);
+
+ h2_out[i] = res_h2;
+}
+
+void IxinferLogSumExpLauncher(half* input, half* output, int N, int H,
+ cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferLogSumExpLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  LogSumExpDim0<<<grid, block, 0, stream>>>(input, output, N, H);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
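+// Torch-facing entry point: expects a contiguous fp16 CUDA tensor of shape
+// [N, C, H], allocates an [N, H] output, and reduces over dim 1 on the current
+// CUDA stream.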
+std::vector<at::Tensor> one_test_opt(at::Tensor input) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 3);
+
+ int N = input.size(0);
+ int C = input.size(1);
+ int H = input.size(2);
+
+ at::Tensor output = input.new_empty({N, H});
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLogSumExpLauncher(p_in, p_out, N, C, H,
+ stream);
+ return {output};
+}
+
+std::vector<at::Tensor> one_test_dim0(at::Tensor input) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 2);
+
+ int N = input.size(0);
+ int H = input.size(1);
+
+ at::Tensor output = input.new_empty({H});
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLogSumExpLauncher(p_in, p_out, N, H,
+ stream);
+ return {output};
+}
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5eaf6fe16e38d1a5694c391de30a6c4b82ed2af5
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp
@@ -0,0 +1,27 @@
+#include <vector>
+
+#include <torch/extension.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input);
+
+std::vector<at::Tensor> test_opt(at::Tensor input) {
+  return one_test_opt(input);
+}
+
+std::vector<at::Tensor> one_test_dim0(at::Tensor input);
+
+std::vector<at::Tensor> test_opt_dim0(at::Tensor input) {
+ return one_test_dim0(input);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "");
+ m.def("test_opt_dim0", &test_opt_dim0, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b22dbddab13f94854c0e334ca53348d9c41f2ba
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ batch_tokens = 2
+ c = 2
+ hidden_size = 320*5000
+
+ inputs = torch.randn([batch_tokens,c, hidden_size]).cuda().half()
+
+ # res1 = torch.log(torch.sum(torch.exp(inputs),dim=-1))
+ # res2 = torch.logsumexp(inputs,dim=-1)
+ # diff = torch.abs(res1-res2)
+ # print(diff.max())
+
+ res_pt = torch.logsumexp(inputs,dim=1)
+
+ res_cu, = sp_opt.test_opt(inputs)
+
+ diff = torch.abs(res_pt - res_cu)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt(inputs)
+
+ batch_tokens = 55
+ hidden_size = 320*5000
+ inputs = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ res_pt = torch.logsumexp(inputs,dim=0)
+ res_cu, = sp_opt.test_opt_dim0(inputs)
+
+ diff = torch.abs(res_pt - res_cu)
+ print(diff.max())
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_dim0(inputs)
+
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48d0cf5b0f3bdd03e18f19dbfded0704b7048b2f
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py
@@ -0,0 +1,33 @@
+import torch
+from faster_stack import sp_opt
+
+# class FasterLogSumExp(torch.nn.Module):
+# def __init__(self, weight, bias):
+# super(FasterLogSumExp, self).__init__()
+# self.weight = weight
+# self.bias = bias
+
+# def forward(self, inputs, *args, **kwargs):
+# hidden_size = self.weight.size(0)
+# in_shape = inputs.shape
+# inputs = inputs.view(-1,hidden_size)
+# output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+# output = output.view(*in_shape)
+# return output
+
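+# Minimal usage sketch (illustrative only; the tensors below are made up). The fused
+# kernel only covers lists of exactly 4 or 2 same-shaped, contiguous fp16 CUDA
+# tensors; every other case falls back to torch.stack:
+#   a = torch.randn(320, 5000, device="cuda", dtype=torch.float16)
+#   b = torch.randn(320, 5000, device="cuda", dtype=torch.float16)
+#   y = FasterStack([a, b])              # custom kernel path, shape [2, 320, 5000]
+#   y_ref = torch.stack([a, b])          # PyTorch reference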
+def FasterStack(inputs):
+ if len(inputs) == 4:
+ a,b,c,d = inputs
+ in_shape = a.shape
+ res, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+ new_shape = (4,) + in_shape
+ res = res.view(*new_shape)
+ return res
+ if len(inputs) == 2:
+ a,b = inputs
+ in_shape = a.shape
+ res, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+ new_shape = (2,) + in_shape
+ res = res.view(*new_shape)
+ return res
+ return torch.stack(inputs)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0fdff64992ad17bb85b43632a0be28f9ae2419be
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu
@@ -0,0 +1,146 @@
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
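+// Copies four equally sized fp16 buffers back-to-back into a single output of
+// length 4 * H, i.e. the flattened equivalent of torch.stack on four 1-D tensors.
+// Each thread moves one half2 (two fp16 values) per input, so H must be even.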
+__global__ void Stack(half* a, half* b, half* c, half* d, half* output, int H) {
+  half2* h2_a = reinterpret_cast<half2*>(a);
+  half2* h2_b = reinterpret_cast<half2*>(b);
+  half2* h2_c = reinterpret_cast<half2*>(c);
+  half2* h2_d = reinterpret_cast<half2*>(d);
+
+  half2* h2_out_a = reinterpret_cast<half2*>(output);
+  half2* h2_out_b = reinterpret_cast<half2*>(output + H);
+  half2* h2_out_c = reinterpret_cast<half2*>(output + H * 2);
+  half2* h2_out_d = reinterpret_cast<half2*>(output + H * 3);
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (i < H / 2) {
+ h2_out_a[i] = h2_a[i];
+ h2_out_b[i] = h2_b[i];
+ h2_out_c[i] = h2_c[i];
+ h2_out_d[i] = h2_d[i];
+ }
+}
+
+void IxinferStackLauncher(half* a, half* b, half* c, half* d, half* output,
+ int H, cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  Stack<<<grid, block, 0, stream>>>(a, b, c, d, output, H);
+}
+
+__global__ void Stack(half* a, half* b, half* output, int H) {
+  half2* h2_a = reinterpret_cast<half2*>(a);
+  half2* h2_b = reinterpret_cast<half2*>(b);
+
+  half2* h2_out_a = reinterpret_cast<half2*>(output);
+  half2* h2_out_b = reinterpret_cast<half2*>(output + H);
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (i < H / 2) {
+ h2_out_a[i] = h2_a[i];
+ h2_out_b[i] = h2_b[i];
+ }
+}
+
+void IxinferStackLauncher(half* a, half* b, half* output, int H,
+ cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  Stack<<<grid, block, 0, stream>>>(a, b, output, H);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
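+// Torch-facing entry points: validate dtype/device/contiguity and matching
+// lengths, allocate the stacked output, and launch the copy kernel on the
+// current CUDA stream.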
+std::vector<at::Tensor> one_test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                     at::Tensor d) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(c.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(c.is_cuda());
+ TORCH_CHECK(c.is_contiguous());
+
+ TORCH_CHECK(d.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(d.is_cuda());
+ TORCH_CHECK(d.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 1);
+ TORCH_CHECK(b.dim() == 1);
+ TORCH_CHECK(c.dim() == 1);
+ TORCH_CHECK(d.dim() == 1);
+
+ int N = a.size(0);
+
+ TORCH_CHECK(b.size(0) == N);
+ TORCH_CHECK(c.size(0) == N);
+ TORCH_CHECK(d.size(0) == N);
+
+ at::Tensor output = a.new_empty({N * 4});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_c = (half*)c.data_ptr();
+ half* p_d = (half*)d.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferStackLauncher(p_a, p_b, p_c, p_d,
+ p_out, N, stream);
+ return {output};
+}
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 1);
+ TORCH_CHECK(b.dim() == 1);
+
+ int N = a.size(0);
+
+ TORCH_CHECK(b.size(0) == N);
+
+ at::Tensor output = a.new_empty({N * 2});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferStackLauncher(p_a, p_b, p_out, N,
+ stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..087030645cd95a86222ddfd0db55958edc3a49c6
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp
@@ -0,0 +1,29 @@
+#include <vector>
+
+#include <torch/extension.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                     at::Tensor d);
+
+std::vector<at::Tensor> test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                 at::Tensor d) {
+  return one_test_opt(a, b, c, d);
+}
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b);
+
+std::vector<at::Tensor> test_opt_2(at::Tensor a, at::Tensor b) {
+ return one_test_opt_2(a, b);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "");
+ m.def("test_opt_2", &test_opt_2, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..185b829b9cb9372cf846f5828e668eaa984ab442
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ batch_tokens = 320
+ hidden_size = 5000
+
+ a = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ b = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ c = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ d = torch.randn([batch_tokens,hidden_size]).cuda().half()
+
+ res_pt = torch.stack([a,b,c,d])
+
+ res_cu, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+ res_cu = res_cu.view(4,batch_tokens,hidden_size)
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+
+ res_pt = torch.stack([a,b])
+
+ res_cu, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+ res_cu = res_cu.view(2,batch_tokens,hidden_size)
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml b/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..859d09f31020a99c41462a367d47b8e986576841
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml
@@ -0,0 +1,253 @@
+# ############################################################################
+# Model: E2E ASR with Transformer
+# Encoder: Transformer Encoder
+# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch
+# Tokens: BPE with unigram
+# losses: CTC + KLdiv (Label Smoothing loss)
+# Training: AISHELL-1
+# Authors: Jianyuan Zhong, Titouan Parcollet
+# ############################################################################
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 8886
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+output_folder: !ref results/transformer/<seed>
+cer_file: !ref <output_folder>/cer.txt
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# Data files
+data_folder: !PLACEHOLDER # e.g., /path/to/aishell
+# noise/ris dataset will automatically be downloaded
+data_folder_rirs: !ref <data_folder> # Change this if needed
+skip_prep: False
+ckpt_interval_minutes: 15 # save checkpoint every N min
+train_data: !ref <output_folder>/csv_data/train.csv
+valid_data: !ref <output_folder>/csv_data/dev.csv
+test_data: !ref <output_folder>/csv_data/test.csv
+tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt
+
+# Training parameters
+number_of_epochs: 50
+batch_size: 64
+ctc_weight: 0.3
+gradient_accumulation: 4
+loss_reduction: 'batchmean'
+sorting: ascending
+
+dynamic_batching: False
+dynamic_batch_sampler:
+ feats_hop_size: 0.01
+  max_batch_len: 15 # in terms of "duration" in annotations by default, seconds here
+  left_bucket_len: 200 # old implementation attributes
+  multiplier: 1.1 # old implementation attributes
+ shuffle_ex: False # if true re-creates batches at each epoch shuffling examples.
+ num_buckets: 10 # floor(log(max_batch_len/left_bucket_len, multiplier)) + 1
+ batch_ordering: ascending
+
+num_workers: 6
+
+# stages related parameters
+stage_one_epochs: 40
+lr_adam: 1.0
+lr_sgd: 0.000025
+
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 80
+
+# Dataloader options
+train_dataloader_opts:
+  batch_size: !ref <batch_size>
+  shuffle: True
+
+valid_dataloader_opts:
+  batch_size: !ref <batch_size>
+
+test_dataloader_opts:
+  batch_size: !ref <batch_size>
+
+####################### Model parameters ###########################
+# Transformer
+d_model: 256
+nhead: 4
+num_encoder_layers: 12
+num_decoder_layers: 6
+d_ffn: 2048
+transformer_dropout: 0.1
+activation: !name:torch.nn.GELU
+output_neurons: 5000
+
+# Outputs
+blank_index: 0
+label_smoothing: 0.1
+pad_index: 0
+bos_index: 1
+eos_index: 2
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0 # 1.0
+valid_search_interval: 10
+valid_beam_size: 10
+test_beam_size: 1
+ctc_weight_decode: 0.40
+
+############################## models ################################
+
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+ input_shape: (8, 10, 80)
+ num_blocks: 2
+ num_layers_per_block: 1
+ out_channels: (256, 256)
+ kernel_sizes: (3, 3)
+ strides: (2, 2)
+ residuals: (False, False)
+
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+ input_size: 5120
+  tgt_vocab: !ref <output_neurons>
+  d_model: !ref <d_model>
+  nhead: !ref <nhead>
+  num_encoder_layers: !ref <num_encoder_layers>
+  num_decoder_layers: !ref <num_decoder_layers>
+  d_ffn: !ref <d_ffn>
+  dropout: !ref <transformer_dropout>
+  activation: !ref <activation>
+ normalize_before: True
+
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+  input_size: !ref <d_model>
+  n_neurons: !ref <output_neurons>
+
+seq_lin: !new:speechbrain.nnet.linear.Linear
+  input_size: !ref <d_model>
+  n_neurons: !ref <output_neurons>
+
+env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
+  openrir_folder: !ref <data_folder_rirs>
+ babble_prob: 0.0
+ reverb_prob: 0.0
+ noise_prob: 1.0
+ noise_snr_low: 0
+ noise_snr_high: 15
+
+modules:
+  CNN: !ref <CNN>
+  Transformer: !ref <Transformer>
+  seq_lin: !ref <seq_lin>
+  ctc_lin: !ref <ctc_lin>
+  env_corrupt: !ref <env_corrupt>
+
+model: !new:torch.nn.ModuleList
+  - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+# define two optimizers here for two-stage training
+Adam: !name:torch.optim.Adam
+ lr: 0
+ betas: (0.9, 0.98)
+ eps: 0.000000001
+
+SGD: !name:torch.optim.SGD
+  lr: !ref <lr_sgd>
+ momentum: 0.99
+ nesterov: True
+
+
+valid_search: !new:speechbrain.decoders.S2STransformerBeamSearch
+  modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+  bos_index: !ref <bos_index>
+  eos_index: !ref <eos_index>
+  blank_index: !ref <blank_index>
+  min_decode_ratio: !ref <min_decode_ratio>
+  max_decode_ratio: !ref <max_decode_ratio>
+  beam_size: !ref <valid_beam_size>
+  ctc_weight: !ref <ctc_weight_decode>
+ using_eos_threshold: False
+ length_normalization: True
+
+test_search: !new:speechbrain.decoders.S2STransformerBeamSearch
+  modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+  bos_index: !ref <bos_index>
+  eos_index: !ref <eos_index>
+  blank_index: !ref <blank_index>
+  min_decode_ratio: !ref <min_decode_ratio>
+  max_decode_ratio: !ref <max_decode_ratio>
+  beam_size: !ref <test_beam_size>
+  ctc_weight: !ref <ctc_weight_decode>
+ using_eos_threshold: False
+ length_normalization: True
+
+log_softmax: !new:torch.nn.LogSoftmax
+ dim: -1
+
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+  blank_index: !ref <blank_index>