diff --git a/README.md b/README.md index 90c697a015b9f333988e3a205fb5176f0b4749b8..a65a7d7c0605ee94158de6dd0a9277a4971ec75e 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,29 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - - + + ConvNeXt-Small + FP16 + Supported + - + + + INT8 + - + - + + + CSPDarkNet50 + FP16 + Supported + - + + + INT8 + - + - + + CSPResNet50 FP16 - @@ -65,6 +87,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - Supported + + DeiT-tiny + FP16 + Supported + - + + + INT8 + - + - + DenseNet121 FP16 @@ -87,6 +120,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + DenseNet169 + FP16 + Supported + - + + + INT8 + - + - + EfficientNet-B0 FP16 @@ -99,7 +143,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 Supported - EfficientNet_B1 + EfficientNet-B1 FP16 Supported Supported @@ -110,9 +154,20 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 Supported - EfficientNetV2 + EfficientNet-B2 FP16 + Supported - + + + INT8 + - + - + + + EfficientNetV2 + FP16 + Supported Supported @@ -222,7 +277,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 RepVGG FP16 - - + Supported Supported @@ -329,6 +384,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + SEResNet50 + FP16 + Supported + - + + + INT8 + - + - + ShuffleNetV1 FP16 @@ -351,6 +417,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + ShuffleNetV2_x1_0 + FP16 + Supported + - + + + INT8 + - + - + SqueezeNet 1.0 FP16 @@ -417,6 +494,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 IGIE IxRT + + ATSS + FP16 + Supported + - + + + INT8 + - + - + CenterNet FP16 @@ -442,7 +530,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 FCOS FP16 - - + Supported Supported @@ -461,6 +549,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + FSAF + FP16 + Supported + - + + + INT8 + - + - + HRNet FP16 @@ -472,6 +571,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + RetinaFace + FP16 + Supported + - + + + INT8 + - + - + RetinaNet FP16 @@ -483,6 +593,17 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 - - + + RTMDet + FP16 + Supported + - + + + INT8 + - + - + YOLOv3 FP16 @@ -573,6 +694,28 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 +### Pose Estimation + + + + + + + + + + + + + + + + + + + +
+    <tr align="center">
+        <th>Models</th>
+        <th>Precision</th>
+        <th>IGIE</th>
+        <th>IxRT</th>
+    </tr>
+    <tr align="center">
+        <td rowspan="2">RTMPose</td>
+        <td>FP16</td>
+        <td>Supported</td>
+        <td>-</td>
+    </tr>
+    <tr align="center">
+        <td>INT8</td>
+        <td>-</td>
+        <td>-</td>
+    </tr>
+ ### Segmentation diff --git a/data/datasets/README.md b/data/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f40c2e824e94c038e186d9e1ffa149a8382a41e2 --- /dev/null +++ b/data/datasets/README.md @@ -0,0 +1 @@ +# This is the default datasets location required by inference models diff --git a/models/cv/classification/convnext_small/igie/README.md b/models/cv/classification/convnext_small/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e0aa51a06162b6d0a3c73826e93204c4bd362b0a --- /dev/null +++ b/models/cv/classification/convnext_small/igie/README.md @@ -0,0 +1,47 @@ +# ConvNeXt Small + +## Description + +The ConvNeXt Small model represents a significant stride in the evolution of convolutional neural networks (CNNs), introduced by researchers at Facebook AI Research (FAIR) and UC Berkeley. It is part of the ConvNeXt family, which challenges the dominance of Vision Transformers (ViTs) in the realm of visual recognition tasks. + +## Setup + +### Install + +```bash +pip3 install onnx +pip3 install tqdm +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +python3 export.py --weight convnext_small-0c510722.pth --output convnext_small.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_convnext_small_fp16_accuracy.sh +# Performance +bash scripts/infer_convnext_small_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| -------------- | --------- | --------- | ------- | -------- | -------- | +| ConvNeXt Small | 32 | FP16 | 725.437 | 83.267 | 96.515 | diff --git a/models/cv/classification/convnext_small/igie/build_engine.py b/models/cv/classification/convnext_small/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/convnext_small/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. 
+ """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/convnext_small/igie/export.py b/models/cv/classification/convnext_small/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..1ec39771ca57ad629d1ebbd5dac4ee71cd3d303c --- /dev/null +++ b/models/cv/classification/convnext_small/igie/export.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torchvision +import argparse + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + model = torchvision.models.convnext_small() + model.load_state_dict(torch.load(args.weight)) + model.eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/convnext_small/igie/inference.py b/models/cv/classification/convnext_small/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef3ec70fa7e88917c54aeb8242fc73a910c696 --- /dev/null +++ b/models/cv/classification/convnext_small/igie/inference.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, 
batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_accuracy.sh b/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..c7837d790c90b4aa6af02ecc15e517d82acfc9c4 --- /dev/null +++ b/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="convnext_small.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path convnext_small_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine convnext_small_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_performance.sh b/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..cde8da5aaf4e74f7d6681cc43915e0faf6d1bbfc --- /dev/null +++ b/models/cv/classification/convnext_small/igie/scripts/infer_convnext_small_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="convnext_small.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path convnext_small_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine convnext_small_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/cspdarknet50/igie/README.md b/models/cv/classification/cspdarknet50/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..99a374a454e09f84799c75ab3d444315a5b8bb40 --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/README.md @@ -0,0 +1,68 @@ +# CSPDarkNet50 + +## Description + +CSPDarkNet50 is an enhanced convolutional neural network architecture that reduces redundant computations by integrating cross-stage partial network features and truncating gradient flow, thereby maintaining high accuracy while lowering computational costs. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmcv==1.5.3 +pip3 install mmcls +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +# git clone mmpretrain +git clone -b v0.24.0 https://github.com/open-mmlab/mmpretrain.git + +# export onnx model +python3 export.py --cfg mmpretrain/configs/cspnet/cspdarknet50_8xb32_in1k.py --weight cspdarknet50_3rdparty_8xb32_in1k_20220329-bd275287.pth --output cspdarknet50.onnx + +# Use onnxsim optimize onnx model +onnxsim cspdarknet50.onnx cspdarknet50_opt.onnx + +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_cspdarknet_fp16_accuracy.sh +# Performance +bash scripts/infer_cspdarknet_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ------------ | --------- | --------- | -------- | -------- | -------- | +| CSPDarkNet50 | 32 | FP16 | 3214.387 | 79.063 | 94.492 | + +## Reference + +CSPDarkNet50: diff --git a/models/cv/classification/cspdarknet50/igie/build_engine.py b/models/cv/classification/cspdarknet50/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
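+
+# Note (editorial comment, not part of the original script): build_engine.py
+# parses "--input name:shape" into an input dict, imports the ONNX model into
+# an IGIE relay module, builds it for the Iluvatar MR target at the requested
+# precision, and saves the compiled engine as a shared library (.so) via
+# export_library().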
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/cspdarknet50/igie/export.py b/models/cv/classification/cspdarknet50/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc8d9fde531853bb9d25966dc7f706f2d9276dd --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/export.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
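+
+# Note (editorial comment, not part of the original script): this exporter
+# loads the mmcls CSPDarkNet50 model from its config and checkpoint, wraps
+# backbone -> neck -> head.fc in a single forward(), and exports the wrapper
+# to ONNX (opset 13) with a dynamic batch dimension.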
+import argparse + +import torch +from mmcls.apis import init_model + +class Model(torch.nn.Module): + def __init__(self, config_file, checkpoint_file): + super().__init__() + self.model = init_model(config_file, checkpoint_file, device="cpu") + + def forward(self, x): + feat = self.model.backbone(x) + feat = self.model.neck(feat) + out_head = self.model.head.fc(feat[0]) + return out_head + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + config_file = args.cfg + checkpoint_file = args.weight + model = Model(config_file, checkpoint_file).eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() + diff --git a/models/cv/classification/cspdarknet50/igie/inference.py b/models/cv/classification/cspdarknet50/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0c602a8f5c899e34f621851c10a5d00c47583c --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/inference.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
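+
+# Note (editorial comment, not part of the original script): inference.py
+# loads the compiled .so engine with tvm.runtime.load_module, runs a
+# timing-only pass when --perf_only is set, and otherwise evaluates ImageNet
+# top-1 / top-5 accuracy, padding the last batch up to --batchsize before
+# inference and truncating the predictions back afterwards.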
+ +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, 
batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_accuracy.sh b/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a1ab2b29abf074f7e2e532ad6c6f7c38d48357c --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="cspdarknet50_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path cspdarknet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine cspdarknet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_performance.sh b/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..e033958ccf69e04357f4b13fc06ce1fa8bc11d68 --- /dev/null +++ b/models/cv/classification/cspdarknet50/igie/scripts/infer_cspdarknet_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
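+
+# Note (editorial comment, not part of the original script): this performance
+# script is identical to the accuracy script except that inference.py is
+# launched with "--perf_only True", so only the mean latency and FPS of the
+# FP16 engine are reported.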
+ +batchsize=32 +model_path="cspdarknet50_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path cspdarknet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine cspdarknet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/deit_tiny/igie/README.md b/models/cv/classification/deit_tiny/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..663ca8b7bc56eb702922f3ff3457086ac6f270e5 --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/README.md @@ -0,0 +1,68 @@ +# DeiT-tiny + +## Description + +DeiT Tiny is a lightweight vision transformer designed for data-efficient learning. It achieves rapid training and high accuracy on small datasets through innovative attention distillation methods, while maintaining the simplicity and efficiency of the model. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmcv==1.5.3 +pip3 install mmcls +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +# git clone mmpretrain +git clone -b v0.24.0 https://github.com/open-mmlab/mmpretrain.git + +# export onnx model +python3 export.py --cfg mmpretrain/configs/deit/deit-tiny_pt-4xb256_in1k.py --weight deit-tiny_pt-4xb256_in1k_20220218-13b382a0.pth --output deit_tiny.onnx + +# Use onnxsim optimize onnx model +onnxsim deit_tiny.onnx deit_tiny_opt.onnx + +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_deit_tiny_fp16_accuracy.sh +# Performance +bash scripts/infer_deit_tin_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| --------- | --------- | --------- | -------- | -------- | -------- | +| DeiT-tiny | 32 | FP16 | 2172.771 | 74.334 | 92.175 | + +## Reference + +Deit_tiny: diff --git a/models/cv/classification/deit_tiny/igie/build_engine.py b/models/cv/classification/deit_tiny/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/deit_tiny/igie/export.py b/models/cv/classification/deit_tiny/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..0078670ffbf4bbcce3358d4a2cedc42ce61176f5 --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/export.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
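+
+# Note (editorial comment, not part of the original script): unlike the
+# CSPDarkNet50 exporter, the DeiT-tiny wrapper goes through the transformer
+# classification head: backbone features are passed to head.pre_logits and
+# then to head.layers.head before the ONNX export.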
+import argparse + +import torch +from mmcls.apis import init_model + +class Model(torch.nn.Module): + def __init__(self, config_file, checkpoint_file): + super().__init__() + self.model = init_model(config_file, checkpoint_file, device="cpu") + + def forward(self, x): + feat = self.model.backbone(x) + head = self.model.head.pre_logits(feat) + out_head = self.model.head.layers.head(head) + return out_head + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + config_file = args.cfg + checkpoint_file = args.weight + model = Model(config_file, checkpoint_file).eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() + diff --git a/models/cv/classification/deit_tiny/igie/inference.py b/models/cv/classification/deit_tiny/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0c602a8f5c899e34f621851c10a5d00c47583c --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/inference.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, 
batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_accuracy.sh b/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6f9a8fb070df948cbf9dbe3e6e3c043c6270063 --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="deit_tiny_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path deit_tiny_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine deit_tiny_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} diff --git a/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_performance.sh b/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..a21f13b246ba7a65399c12d1613055be39d73e81 --- /dev/null +++ b/models/cv/classification/deit_tiny/igie/scripts/infer_deit_tiny_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +batchsize=32 +model_path="deit_tiny_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path deit_tiny_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine deit_tiny_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/densenet169/igie/README.md b/models/cv/classification/densenet169/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b6b8abd4859a500a096b917c5668670db19fb233 --- /dev/null +++ b/models/cv/classification/densenet169/igie/README.md @@ -0,0 +1,47 @@ +# DenseNet169 + +## Description + +DenseNet-169 is a variant of the Dense Convolutional Network (DenseNet) architecture, characterized by its 169 layers and a growth rate of 32. This network leverages the dense connectivity pattern, where each layer is connected to every other layer in a feed-forward fashion, resulting in a substantial increase in the number of direct connections compared to traditional convolutional networks. This connectivity pattern facilitates the reuse of features and enhances the flow of information and gradients throughout the network, which is particularly beneficial for deep architectures. + +## Setup + +### Install + +```bash +pip3 install onnx +pip3 install tqdm +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +python3 export.py --weight densenet169-b2777c0a.pth --output densenet169.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_densenet169_fp16_accuracy.sh +# Performance +bash scripts/infer_densenet169_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ----------- | --------- | --------- | -------- | -------- | -------- | +| DenseNet169 | 32 | FP16 | 1384.649 | 75.548 | 92.778 | diff --git a/models/cv/classification/densenet169/igie/build_engine.py b/models/cv/classification/densenet169/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/densenet169/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/densenet169/igie/export.py b/models/cv/classification/densenet169/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..44c7269055407350e6af92f9a85d8d9bc5e5bab5 --- /dev/null +++ b/models/cv/classification/densenet169/igie/export.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torchvision +import argparse +import re + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + model = torchvision.models.densenet169(weights=False) + + state_dict = torch.load(args.weight) + + pattern = re.compile(r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$' + ) + for key in list(state_dict.keys()): + res = pattern.match(key) + if res: + new_key = res.group(1) + res.group(2) + state_dict[new_key] = state_dict[key] + del state_dict[key] + + model.load_state_dict(state_dict) + model.eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! 
") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/densenet169/igie/inference.py b/models/cv/classification/densenet169/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef3ec70fa7e88917c54aeb8242fc73a910c696 --- /dev/null +++ b/models/cv/classification/densenet169/igie/inference.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # 
load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_accuracy.sh b/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..6475ddcdd58e63bfd7cd911dc3be1801026d6097 --- /dev/null +++ b/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
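+
+# Note (editorial comment, not part of the original script): the accuracy
+# scripts build a batch-32 FP16 engine from the exported ONNX model and then
+# run inference.py against ${DATASETS_DIR} without --perf_only, so top-1 and
+# top-5 accuracy are computed over the ImageNet validation set.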
+ +batchsize=32 +model_path="densenet169.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path densenet169_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine densenet169_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_performance.sh b/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..8a36fc2f0129a71e0fd3ef678e06ee4eb3651fa3 --- /dev/null +++ b/models/cv/classification/densenet169/igie/scripts/infer_densenet169_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="densenet169.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path densenet169_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine densenet169_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/efficientnet_b0/ixrt/README.md b/models/cv/classification/efficientnet_b0/ixrt/README.md index aeba963e8848fa9f97f92f734179d963b888f307..55690187dbc51e74d89765ab0457d72aa7f65150 100644 --- a/models/cv/classification/efficientnet_b0/ixrt/README.md +++ b/models/cv/classification/efficientnet_b0/ixrt/README.md @@ -55,5 +55,5 @@ bash scripts/infer_efficientnet_b0_int8_performance.sh Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%) ----------------|-----------|----------|----------|----------|-------- -EfficientNet_B0 | 32 | FP16 | 2325.54 | 77.66 | 93.58 -EfficientNet_B0 | 32 | INT8 | 2666.00 | 74.27 | 91.85 +EfficientNet B0 | 32 | FP16 | 2325.54 | 77.66 | 93.58 +EfficientNet B0 | 32 | INT8 | 2666.00 | 74.27 | 91.85 diff --git a/models/cv/classification/efficientnet_b1/igie/README.md b/models/cv/classification/efficientnet_b1/igie/README.md index dc5344e2dccb120849168f959d1678473dd8e858..0656f187cda59092a2912b82f05bbd425e196201 100644 --- a/models/cv/classification/efficientnet_b1/igie/README.md +++ b/models/cv/classification/efficientnet_b1/igie/README.md @@ -44,4 +44,4 @@ bash scripts/infer_efficientnet_b1_fp16_performance.sh Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%) ----------------|-----------|----------|---------|---------|-------- -Efficientnet_b1 | 32 | FP16 | 1292.31 | 78.823 | 94.494 +EfficientNet B1 | 32 | FP16 | 1292.31 | 78.823 | 94.494 diff --git a/models/cv/classification/efficientnet_b2/igie/README.md 
b/models/cv/classification/efficientnet_b2/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f439723944caa5cc6d27ccf481117157bfc3eec2 --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/README.md @@ -0,0 +1,47 @@ +# EfficientNet B2 + +## Description + +EfficientNet B2 is a member of the EfficientNet family, a series of convolutional neural network architectures that are designed to achieve excellent accuracy and efficiency. Introduced by researchers at Google, EfficientNets utilize the compound scaling method, which uniformly scales the depth, width, and resolution of the network to improve accuracy and efficiency. + +## Setup + +### Install + +```bash +pip3 install onnx +pip3 install tqdm +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +python3 export.py --weight efficientnet_b2_rwightman-c35c1473.pth --output efficientnet_b2.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_efficientnet_b2_fp16_accuracy.sh +# Performance +bash scripts/infer_efficientnet_b2_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| --------------- | --------- | --------- | -------- | -------- | -------- | +| EfficientNet B2 | 32 | FP16 | 1527.044 | 77.739 | 93.702 | diff --git a/models/cv/classification/efficientnet_b2/igie/build_engine.py b/models/cv/classification/efficientnet_b2/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. 
+ """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/efficientnet_b2/igie/export.py b/models/cv/classification/efficientnet_b2/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..7761ffaca5fd02798d898e12392f3e0501aef0ec --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/export.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torchvision +import argparse + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + model = torchvision.models.efficientnet_b2() + model.load_state_dict(torch.load(args.weight)) + model.eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/efficientnet_b2/igie/inference.py b/models/cv/classification/efficientnet_b2/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef3ec70fa7e88917c54aeb8242fc73a910c696 --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/inference.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, 
batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_accuracy.sh b/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..d4ed3ef0893017781d18c20c4387aa8c7f52851c --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="efficientnet_b2.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path efficientnet_b2_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine efficientnet_b2_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_performance.sh b/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..4b9602218a6a2c5be91f8ee2f97e63df7abc85e4 --- /dev/null +++ b/models/cv/classification/efficientnet_b2/igie/scripts/infer_efficientnet_b2_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="efficientnet_b2.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path efficientnet_b2_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine efficientnet_b2_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/igie/README.md b/models/cv/classification/efficientnet_v2/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cbcc5a4eb2cbc136a1a7b489f53efe4e022808e0 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/README.md @@ -0,0 +1,47 @@ +# EfficientNetV2-M + +## Description + +EfficientNetV2 M is an optimized model in the EfficientNetV2 series, which was developed by Google researchers. It continues the legacy of the EfficientNet family, focusing on advancing the state-of-the-art in accuracy and efficiency through advanced scaling techniques and architectural innovations. + +## Setup + +### Install + +```bash +pip3 install onnx +pip3 install tqdm +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +python3 export.py --weight efficientnet_v2_m-dc08266a.pth --output efficientnet_v2_m.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_efficientnet_v2_m_fp16_accuracy.sh +# Performance +bash scripts/infer_efficientnet_v2_m_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ---------------- | --------- | --------- | -------- | -------- | -------- | +| EfficientNetV2-M | 32 | FP16 | 1104.846 | 79.635 | 94.456 | diff --git a/models/cv/classification/efficientnet_v2/igie/build_engine.py b/models/cv/classification/efficientnet_v2/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
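+
+# build_engine.py compiles an ONNX model into an IGIE engine (.so):
+#   1. --input "name:N,C,H,W" is parsed into an input-shape dict,
+#   2. the model is imported with import_model_to_igie,
+#   3. relay.build() compiles it for the iluvatar target at the requested
+#      precision (fp32 / fp16 / int8),
+#   4. the compiled library is exported to --engine_path for inference.py to load.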
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/igie/export.py b/models/cv/classification/efficientnet_v2/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..cfb9f76e7503ce443da7090c5739e4880652f194 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/export.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torchvision +import argparse + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + model = torchvision.models.efficientnet_v2_m() + model.load_state_dict(torch.load(args.weight)) + model.eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! 
") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/efficientnet_v2/igie/inference.py b/models/cv/classification/efficientnet_v2/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef3ec70fa7e88917c54aeb8242fc73a910c696 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/inference.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = 
tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_accuracy.sh b/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..d7aa78a2d9b15370dc1e2a5fc5a9fd14ab68668d --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
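+
+# Note: the engine is compiled for a static input shape, so the batch size baked
+# into --input below must match the --batchsize passed to inference.py; here both
+# come from the batchsize variable above, and changing it means rebuilding the engine.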
+ +batchsize=32 +model_path="efficientnet_v2_m.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path efficientnet_v2_m_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine efficientnet_v2_m_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_performance.sh b/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..21073983ec294f1612356de6848ed2c08f9374d4 --- /dev/null +++ b/models/cv/classification/efficientnet_v2/igie/scripts/infer_efficientnet_v2_m_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="efficientnet_v2_m.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path efficientnet_v2_m_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine efficientnet_v2_m_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/mobilenet_v2/ixrt/README.md b/models/cv/classification/mobilenet_v2/ixrt/README.md index 46343115fc731320eb2f61b0358695cb0608116b..10852a9550a34658bf9bc5b8d4a0dadd91e98bb9 100644 --- a/models/cv/classification/mobilenet_v2/ixrt/README.md +++ b/models/cv/classification/mobilenet_v2/ixrt/README.md @@ -13,17 +13,19 @@ pip3 install tqdm pip3 install onnxsim pip3 install opencv-python pip3 install ppq +pip3 install protobuf==3.20.0 ``` ### Download -Download the [imagenet](https://www.image-net.org/download.php) validation dataset, and place in data/datasets; +Download the [imagenet](https://www.image-net.org/download.php) validation dataset, and place in `${PROJ_ROOT}/data/datasets`; ## Inference ### FP16 ```bash +cd python/ # Test ACC bash script/infer_mobilenetv2_fp16_accuary.sh # Test FPS diff --git a/models/cv/classification/mobilenet_v2/ixrt/python/inference.py b/models/cv/classification/mobilenet_v2/ixrt/python/inference.py index e726dabc1f19cadeda9f130ef52f8b36ad435d26..ea3f7f6b47414387508d955f71344a4af3217167 100644 --- a/models/cv/classification/mobilenet_v2/ixrt/python/inference.py +++ b/models/cv/classification/mobilenet_v2/ixrt/python/inference.py @@ -85,6 +85,7 @@ def main(config): total_sample = 0 acc_top1, acc_top5 = 0, 0 + start_time = time.time() with tqdm(total= len(dataloader)) as _tqdm: for idx, (batch_data, batch_label) in enumerate(dataloader): batch_data = 
batch_data.numpy().astype(inputs[0]["dtype"]) @@ -106,7 +107,10 @@ def main(config): _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample), acc_5='{:.4f}'.format(acc_top5/total_sample)) _tqdm.update(1) + end_time = time.time() + end2end_time = end_time - start_time + print(F"E2E time : {end2end_time:.3f} seconds") print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}") print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}") acc1 = acc_top1/total_sample diff --git a/models/cv/classification/repvgg/igie/README.md b/models/cv/classification/repvgg/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cbbeaac49e4c93cebf25e693f6499d4be333c252 --- /dev/null +++ b/models/cv/classification/repvgg/igie/README.md @@ -0,0 +1,65 @@ +# RepVGG + +## Description + +RepVGG is an innovative convolutional neural network architecture that combines the simplicity of VGG-style inference with a multi-branch topology during training. Through structural re-parameterization, RepVGG achieves high accuracy while significantly improving computational efficiency. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install mmcv==1.5.3 +pip3 install mmcls +pip3 install mmengine +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +# git clone mmpretrain +git clone -b v0.24.0 https://github.com/open-mmlab/mmpretrain.git + +# export onnx model +python3 export.py --cfg mmpretrain/configs/repvgg/repvgg-A0_4xb64-coslr-120e_in1k.py --weight repvgg-A0_8xb32_in1k_20221213-60ae8e23.pth --output repvgg.onnx + +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_repvgg_fp16_accuracy.sh +# Performance +bash scripts/infer_repvgg_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ------ | --------- | --------- | -------- | -------- | -------- | +| RepVGG | 32 | FP16 | 7423.035 | 72.345 | 90.543 | + +## Reference + +RepVGG: diff --git a/models/cv/classification/repvgg/igie/build_engine.py b/models/cv/classification/repvgg/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/repvgg/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/repvgg/igie/export.py b/models/cv/classification/repvgg/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc8d9fde531853bb9d25966dc7f706f2d9276dd --- /dev/null +++ b/models/cv/classification/repvgg/igie/export.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
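+# The export wraps the mmcls model so that forward() runs backbone -> neck ->
+# head.fc and returns raw logits, then exports it to ONNX with a dynamic batch
+# axis on 'input'/'output' (opset 13) so the engine builder can fix the batch later.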
+import argparse + +import torch +from mmcls.apis import init_model + +class Model(torch.nn.Module): + def __init__(self, config_file, checkpoint_file): + super().__init__() + self.model = init_model(config_file, checkpoint_file, device="cpu") + + def forward(self, x): + feat = self.model.backbone(x) + feat = self.model.neck(feat) + out_head = self.model.head.fc(feat[0]) + return out_head + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + config_file = args.cfg + checkpoint_file = args.weight + model = Model(config_file, checkpoint_file).eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() + diff --git a/models/cv/classification/repvgg/igie/inference.py b/models/cv/classification/repvgg/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0c602a8f5c899e34f621851c10a5d00c47583c --- /dev/null +++ b/models/cv/classification/repvgg/igie/inference.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
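+
+# Inference flow: load the compiled IGIE engine into a graph executor, then
+# either time 100 runs (--perf_only True, reporting mean latency and FPS) or
+# iterate over the ImageNet validation set (Resize 256 / CenterCrop 224 /
+# normalize), padding the last partial batch up to the engine's static batch
+# size, and accumulate Top-1/Top-5 accuracy.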
+ +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, 
batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_accuracy.sh b/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..37f6fcb2ab964b22bd9e29f0015a1b2900f5a560 --- /dev/null +++ b/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="repvgg.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path repvgg_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine repvgg_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_performance.sh b/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..ad29f598b8dbe3512e5288bfd7c757df89654b09 --- /dev/null +++ b/models/cv/classification/repvgg/igie/scripts/infer_repvgg_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +batchsize=32 +model_path="repvgg.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path repvgg_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine repvgg_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/repvgg/ixrt/README.md b/models/cv/classification/repvgg/ixrt/README.md index 95104dad4374362a8bded98e5c3557065dbbc30e..37bbbcd480afdbddb3fd622c2eeacb59f2faeacb 100644 --- a/models/cv/classification/repvgg/ixrt/README.md +++ b/models/cv/classification/repvgg/ixrt/README.md @@ -1,4 +1,4 @@ -# REPVGG +# RepVGG ## Description @@ -65,6 +65,6 @@ bash scripts/infer_repvgg_fp16_performance.sh ## Results -Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%) --------|-----------|----------|---------|----------|-------- -REPVGG | 32 | FP16 | 5725.37 | 72.41 | 90.49 +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ------ | --------- | --------- | ------- | -------- | -------- | +| RepVGG | 32 | FP16 | 5725.37 | 72.41 | 90.49 | diff --git a/models/cv/classification/resnet50/ixrt/inference.py b/models/cv/classification/resnet50/ixrt/inference.py index 2c9dcb3f9cc5b9a26903651a31fafa16d8f0db31..50aafd4fd5ef9664203cdcbdfbdb577edca933c4 100644 --- a/models/cv/classification/resnet50/ixrt/inference.py +++ b/models/cv/classification/resnet50/ixrt/inference.py @@ -83,6 +83,7 @@ def main(config): total_sample = 0 acc_top1, acc_top5 = 0, 0 + start_time = time.time() with tqdm(total= len(dataloader)) as _tqdm: for idx, (batch_data, batch_label) in enumerate(dataloader): batch_data = batch_data.numpy().astype(inputs[0]["dtype"]) @@ -104,7 +105,10 @@ def main(config): _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample), acc_5='{:.4f}'.format(acc_top5/total_sample)) _tqdm.update(1) + end_time = time.time() + end2end_time = end_time - start_time + print(F"E2E time : {end2end_time:.3f} seconds") print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}") print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}") acc1 = acc_top1/total_sample diff --git a/models/cv/classification/se_resnet50/igie/README.md b/models/cv/classification/se_resnet50/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ab59b0e8cdf70d652e3de0677488f2288ddbe429 --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/README.md @@ -0,0 +1,64 @@ +# SEResNet50 + +## Description + +SEResNet50 is an enhanced version of the ResNet50 network integrated with Squeeze-and-Excitation (SE) blocks, which strengthens the network's feature expression capability by explicitly emphasizing useful features and suppressing irrelevant ones. This improvement enables SEResNet50 to demonstrate higher accuracy in various visual recognition tasks compared to the standard ResNet50. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install mmcv==1.5.3 +pip3 install mmcls +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. 
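+
+The inference scripts read the dataset with `torchvision.datasets.ImageFolder`, so the validation directory (pointed to by `DATASETS_DIR` below) is assumed to be laid out one sub-folder per class, for example:
+
+```bash
+imagenet_val/
+├── n01440764/
+│   └── *.JPEG
+├── n01443537/
+│   └── *.JPEG
+└── ...
+```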
+ +### Model Conversion + +```bash +# git clone mmpretrain +git clone -b v0.24.0 https://github.com/open-mmlab/mmpretrain.git + +# export onnx model +python3 export.py --cfg mmpretrain/configs/seresnet/seresnet50_8xb32_in1k.py --weight se-resnet50_batch256_imagenet_20200804-ae206104.pth --output seresnet50.onnx + +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_seresnet_fp16_accuracy.sh +# Performance +bash scripts/infer_seresnet_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ---------- | --------- | --------- | -------- | -------- | -------- | +| SEResNet50 | 32 | FP16 | 2548.268 | 77.709 | 93.812 | + +## Reference + +SE_ResNet50: diff --git a/models/cv/classification/se_resnet50/igie/build_engine.py b/models/cv/classification/se_resnet50/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/se_resnet50/igie/export.py b/models/cv/classification/se_resnet50/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..7dc8d9fde531853bb9d25966dc7f706f2d9276dd --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/export.py @@ -0,0 +1,78 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import argparse + +import torch +from mmcls.apis import init_model + +class Model(torch.nn.Module): + def __init__(self, config_file, checkpoint_file): + super().__init__() + self.model = init_model(config_file, checkpoint_file, device="cpu") + + def forward(self, x): + feat = self.model.backbone(x) + feat = self.model.neck(feat) + out_head = self.model.head.fc(feat[0]) + return out_head + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + config_file = args.cfg + checkpoint_file = args.weight + model = Model(config_file, checkpoint_file).eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() + diff --git a/models/cv/classification/se_resnet50/igie/inference.py b/models/cv/classification/se_resnet50/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..1b0c602a8f5c899e34f621851c10a5d00c47583c --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/inference.py @@ -0,0 +1,185 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
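+
+# Accuracy accounting: get_topk_accuracy() counts a Top-1 hit toward Top-5 as
+# well, and total_num is advanced by the full batch size even for the padded
+# final batch, so the reported percentages can be marginally pessimistic when
+# the validation set size is not a multiple of the batch size.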
+ +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, 
batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_accuracy.sh b/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..c861e85be2300f51b41c671d09513272480cb5fe --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="seresnet50.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path seresnet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine seresnet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_performance.sh b/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..caaeaa7c2035f56fe4f62fd23387024e8c85875d --- /dev/null +++ b/models/cv/classification/se_resnet50/igie/scripts/infer_seresnet_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
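+
+# This script expects DATASETS_DIR to point at the ImageNet validation folder.
+# It first compiles seresnet50.onnx into a batch-32 FP16 IGIE engine, then runs
+# inference.py with --perf_only True so only mean latency and FPS are reported.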
+ +batchsize=32 +model_path="seresnet50.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path seresnet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine seresnet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/README.md b/models/cv/classification/shufflenetv2_x1_0/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1211bba938ee81dcf2c76b9b85d81fd71ddc69a5 --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/README.md @@ -0,0 +1,47 @@ +# ShuffleNetV2_x1_0 + +## Description + +ShuffleNet V2_x1_0 is an efficient convolutional neural network (CNN) architecture that emphasizes a balance between computational efficiency and accuracy, particularly suited for deployment on mobile and embedded devices. The model refines the ShuffleNet series by introducing structural innovations that enhance feature reuse and reduce redundancy, all while maintaining simplicity and performance. + +## Setup + +### Install + +```bash +pip3 install onnx +pip3 install tqdm +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +### Model Conversion + +```bash +python3 export.py --weight shufflenetv2_x1-5666bf0f80.pth --output shufflenetv2_x1_0.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/imagenet_val/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_shufflenetv2_x1_0_fp16_accuracy.sh +# Performance +bash scripts/infer_shufflenetv2_x1_0_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) | +| ----------------- | --------- | --------- | -------- | -------- | -------- | +| ShuffleNetV2_x1_0 | 32 | FP16 | 8232.980 | 69.308 | 88.302 | diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/build_engine.py b/models/cv/classification/shufflenetv2_x1_0/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
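+
+# build_engine.py turns an ONNX model into an IGIE engine: it parses the
+# --input "name:shape" string, imports the model with import_model_to_igie,
+# compiles it with tvm.relay.build for the Iluvatar MR target at the requested
+# precision (fp32/fp16/int8), and exports the result to --engine_path as a
+# shared library that inference.py loads at runtime.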
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/export.py b/models/cv/classification/shufflenetv2_x1_0/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4ff9ba8168c1e33a9a5677facc9b9d03afb911 --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/export.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torchvision +import argparse + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + model = torchvision.models.shufflenet_v2_x1_0() + model.load_state_dict(torch.load(args.weight)) + model.eval() + + input_names = ['input'] + output_names = ['output'] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 224, 224) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + + print("Export onnx model successfully! 
") + +if __name__ == "__main__": + main() diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/inference.py b/models/cv/classification/shufflenetv2_x1_0/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3aef3ec70fa7e88917c54aeb8242fc73a910c696 --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/inference.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import sys +import argparse +import tvm +import torch +import torchvision +import numpy as np +from tvm import relay +from tqdm import tqdm +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def get_dataloader(data_path, batch_size, num_workers): + dataset = torchvision.datasets.ImageFolder( + data_path, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize( + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225) + ) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers) + + return dataloader + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = 
tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # get dataloader + dataloader = get_dataloader(args.datasets, batch_size, args.num_workers) + + top1_acc = 0 + top5_acc = 0 + total_num = 0 + + for image, label in tqdm(dataloader): + + # pad the last batch + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input(args.input_name, tvm.nd.array(image, device)) + + # run inference + module.run() + + pred = module.get_output(0).asnumpy() + + if pad_batch: + pred = pred[:origin_size] + + # get batch accuracy + batch_top1_acc, batch_top5_acc = get_topk_accuracy(pred, label) + + top1_acc += batch_top1_acc + top5_acc += batch_top5_acc + total_num += batch_size + + result_stat = {} + result_stat["acc@1"] = round(top1_acc / total_num * 100.0, 3) + result_stat["acc@5"] = round(top5_acc / total_num * 100.0, 3) + + print(f"\n* Top1 acc: {result_stat['acc@1']} %, Top5 acc: {result_stat['acc@5']} %") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_accuracy.sh b/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..91802347326ce32c17d7f0a3309328a3976314db --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
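+
+# Builds a batch-32 FP16 engine from shufflenetv2_x1_0.onnx, then evaluates
+# Top-1/Top-5 accuracy on the ImageNet validation set under ${DATASETS_DIR}.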
+ +batchsize=32 +model_path="shufflenetv2_x1_0.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path shufflenetv2_x1_0_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine shufflenetv2_x1_0_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_performance.sh b/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..21353dc71844e052a925a493f58d6860a20323da --- /dev/null +++ b/models/cv/classification/shufflenetv2_x1_0/igie/scripts/infer_shufflenetv2_x1_0_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="shufflenetv2_x1_0.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,224,224 \ + --precision fp16 \ + --engine_path shufflenetv2_x1_0_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine shufflenetv2_x1_0_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/classification/swin_transformer_Large/ixrt/README.md b/models/cv/classification/swin_transformer_Large/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9cb811bfc076bee7d42775a16f8e8803180aebcb --- /dev/null +++ b/models/cv/classification/swin_transformer_Large/ixrt/README.md @@ -0,0 +1,83 @@ +# Swin-L + +## Description + +Swin Transformer-Large is a variant of the Swin Transformer, an architecture designed for computer vision tasks, particularly within the realms of image classification, object detection, and segmentation. The Swin Transformer-Large model represents an expanded version with more layers and parameters compared to its base configuration, aiming for improved performance and deeper processing of visual data. + +## Setup + +### Install + +```bash +pip3 install onnxsim +pip3 install onnx_graphsurgeon +pip3 install scikit-learn +pip3 install tqdm +pip3 install pycuda +pip3 install onnx +pip3 install tabulate +pip3 install cv2 +pip3 install pycocotools +pip3 install opencv-python==4.6.0.66 +``` + +### Download + +Pretrained model: + +Dataset: to download the open_imagenet dataset. + +or you can : +```bash +bash /scripts/prepare_model_and_dataset.sh + +``` + +### Model Conversion +Please correct the paths in the following commands or files. 
+```bash +tar -xvf open-swin-large.tar +wget +python3 torch2onnx.py --model_path swin-transformer-large.pt --output_path swin-large-torch-fp32.onnx + +``` + +## Inference + + +```bash +export ORIGIN_ONNX_NAME=/Path/swin-large-torch-fp32.onnx +export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py +export PROJ_PATH=./ +``` + +### Performance + +```bash + +bash scripts/infer_swinl_fp16_performance.sh +``` + +### Accuracy + +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend + + +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. + +```bash + +pip3 install -r toolbox/ByteMLPerf/blob/iluvatar_general_infer/byte_infer_perf/general_perf/requirements.txt +mv /ixrt/perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +cd toolbox/ByteMLPerf/byte_infer_perf/ +mv /general_perf/general_perf/model_zoo/popular/swin-large /general_perf/model_zoo/popular/swin-large +cd toolbox/ByteMLPerf/byte_infer_perf/general_perf +python3 core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32 +``` + + +## Results + +Model |BatchSize |Precision |QPS |Top-1 Acc | +--------|-----------|----------|----------|-----------| +Swin-L | 16 | FP16 | 5.746 | 85.62 | \ No newline at end of file diff --git a/models/cv/detection/atss/igie/README.md b/models/cv/detection/atss/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa4ffd64486563276fb4ad966cc5eef7fb99d3c3 --- /dev/null +++ b/models/cv/detection/atss/igie/README.md @@ -0,0 +1,68 @@ +# ATSS + +## Description + +ATSS is an advanced adaptive training sample selection method that effectively enhances the performance of both anchor-based and anchor-free object detectors by dynamically choosing positive and negative samples based on the statistical characteristics of objects. The design of ATSS reduces reliance on hyperparameters, simplifies the sample selection process, and significantly improves detection accuracy without adding extra computational costs. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmdet==3.3.0 +pip3 install mmdeploy==1.3.1 +pip3 install mmengine==0.10.4 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. 
+ +```bash +wget https://download.openmmlab.com/mmdetection/v2.0/atss/atss_r50_fpn_1x_coco/atss_r50_fpn_1x_coco_20200209-985f7bd0.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight atss_r50_fpn_1x_coco_20200209-985f7bd0.pth --cfg atss_r50_fpn_1x_coco.py --output atss.onnx + +# use onnxsim optimize onnx model +onnxsim atss.onnx atss_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/coco/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_atss_fp16_accuracy.sh +# Performance +bash scripts/infer_atss_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Input Shape | Precision | FPS | mAP@0.5(%) | +| :---: | :-------: | :---------: | :-------: | :-----: | :--------: | +| ATSS | 32 | 800x800 | FP16 | 126.864 | 0.541 | diff --git a/models/cv/detection/atss/igie/atss_r50_fpn_1x_coco.py b/models/cv/detection/atss/igie/atss_r50_fpn_1x_coco.py new file mode 100755 index 0000000000000000000000000000000000000000..0378cf0b6f9307ccd1e931eab2c705ab3b121475 --- /dev/null +++ b/models/cv/detection/atss/igie/atss_r50_fpn_1x_coco.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License.
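+
+# MMDetection config for ATSS (ResNet-50 backbone + FPN, 1x schedule) dumped
+# into a single file. Only the test-time settings are exercised here: the test
+# pipeline resizes images to a fixed 800x800 (keep_ratio=False), and
+# inference.py overrides data_root, batch_size and the evaluator ann_file at
+# runtime.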
+ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=1, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +load_from = None +log_level = 'ERROR' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +model = dict( + backbone=dict( + depth=50, + frozen_stages=1, + init_cfg=dict(checkpoint='torchvision://resnet50', type='Pretrained'), + norm_cfg=dict(requires_grad=True, type='BN'), + norm_eval=True, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + style='pytorch', + type='ResNet'), + bbox_head=dict( + anchor_generator=dict( + octave_base_scale=8, + ratios=[ + 1.0, + ], + scales_per_octave=1, + strides=[ + 8, + 16, + 32, + 64, + 128, + ], + type='AnchorGenerator'), + bbox_coder=dict( + target_means=[ + 0.0, + 0.0, + 0.0, + 0.0, + ], + target_stds=[ + 0.1, + 0.1, + 0.2, + 0.2, + ], + type='DeltaXYWHBBoxCoder'), + feat_channels=256, + in_channels=256, + loss_bbox=dict(loss_weight=2.0, type='GIoULoss'), + loss_centerness=dict( + loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=True), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='FocalLoss', + use_sigmoid=True), + num_classes=80, + stacked_convs=4, + type='ATSSHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_size_divisor=32, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='DetDataPreprocessor'), + neck=dict( + add_extra_convs='on_output', + in_channels=[ + 256, + 512, + 1024, + 2048, + ], + num_outs=5, + out_channels=256, + start_level=1, + type='FPN'), + test_cfg=dict( + max_per_img=100, + min_bbox_size=0, + nms=dict(iou_threshold=0.6, type='nms'), + nms_pre=1000, + score_thr=0.05), + train_cfg=dict( + allowed_border=-1, + assigner=dict(topk=9, type='ATSSAssigner'), + debug=False, + pos_weight=-1), + type='ATSS') +optim_wrapper = dict( + optimizer=dict(lr=0.02, momentum=0.9, type='SGD', weight_decay=0.0001), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=500, start_factor=0.001, type='LinearLR'), + dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 8, + 11, + ], + type='MultiStepLR'), +] +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='images/val2017/'), + data_root='/root/.igie_cache/modelzoo_data/datasets/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file= + '/root/.igie_cache/modelzoo_data/datasets/coco/annotations/instances_val2017.json', + backend_args=None, + 
format_only=False, + metric='bbox', + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=1) +train_dataloader = dict( + batch_sampler=dict(type='AspectRatioBatchSampler'), + batch_size=2, + dataset=dict( + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' diff --git a/models/cv/detection/atss/igie/build_engine.py b/models/cv/detection/atss/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/detection/atss/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
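+
+# Example invocation, matching scripts/infer_atss_fp16_accuracy.sh:
+#   python3 build_engine.py --model_path atss_opt.onnx \
+#       --input input:32,3,800,800 --precision fp16 \
+#       --engine_path atss_bs_32_fp16.so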
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/detection/atss/igie/deploy_default.py b/models/cv/detection/atss/igie/deploy_default.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d8e43dc829456f0c2e46a7acfc3128757f945d --- /dev/null +++ b/models/cv/detection/atss/igie/deploy_default.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['output'], + input_shape=None, + optimize=True) + +codebase_config = dict( + type='mmdet', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + )) + +backend_config = dict(type='onnxruntime') \ No newline at end of file diff --git a/models/cv/detection/atss/igie/export.py b/models/cv/detection/atss/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..13573c9dff3d96be4ba59eaa8698d67fb1d50f13 --- /dev/null +++ b/models/cv/detection/atss/igie/export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse + +import torch +from mmdeploy.utils import load_config +from mmdeploy.apis import build_task_processor + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + deploy_cfg = 'deploy_default.py' + model_cfg = args.cfg + model_checkpoint = args.weight + + deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + + task_processor = build_task_processor(model_cfg, deploy_cfg, device='cpu') + + model = task_processor.build_pytorch_model(model_checkpoint) + + input_names = ['input'] + dynamic_axes = {'input': {0: '-1'}} + dummy_input = torch.randn(1, 3, 800, 800) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/cv/detection/atss/igie/inference.py b/models/cv/detection/atss/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..cc1a2b8604533313fb7cb8f41d5d899e3ce25553 --- /dev/null +++ b/models/cv/detection/atss/igie/inference.py @@ -0,0 +1,160 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
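+
+# Evaluation flow: load the IGIE engine, build an MMEngine runner from
+# atss_r50_fpn_1x_coco.py, and for each COCO val2017 batch run the engine and
+# split its raw outputs by channel count into classification scores (80
+# channels), box regressions (4 channels) and centerness factors (the rest).
+# The outputs are decoded with bbox_head.predict_by_feat and scored by
+# CocoMetric (bbox mAP). With --perf_only True only the latency/FPS benchmark
+# is executed.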
+ +import os +import argparse +import tvm +import torch +import numpy as np +from tvm import relay +from tqdm import tqdm +from mmpose.registry import RUNNERS +from mmengine.config import Config + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # runner config + cfg = Config.fromfile("atss_r50_fpn_1x_coco.py") + + cfg.work_dir = "./" + cfg['test_dataloader']['batch_size'] = batch_size + cfg['test_dataloader']['dataset']['data_root'] = args.datasets + cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'images/val2017/' + cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/instances_val2017.json') + cfg['log_level'] = 'ERROR' + + # build runner + runner = RUNNERS.build(cfg) + + for data in tqdm(runner.test_dataloader): + cls_score = [] + box_reg = [] + score_factors = [] + + input_data = runner.model.data_preprocessor(data, False) + image = input_data['inputs'].cpu() + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + for i in range(module.get_num_outputs()): + output = module.get_output(i).asnumpy() + + if pad_batch: + output = output[:origin_size] + + output = torch.from_numpy(output) + + if output.shape[1] == 80: + cls_score.append(output) + elif output.shape[1] == 4: + box_reg.append(output) + else: + score_factors.append(output) + + batch_img_metas = [ + data_samples.metainfo for data_samples in data['data_samples'] + ] + + preds = runner.model.bbox_head.predict_by_feat( + cls_score, box_reg, score_factors=score_factors, batch_img_metas=batch_img_metas, rescale=True + ) + + batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], preds) + + runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=data) + + metrics = 
runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset)) + + +if __name__ == "__main__": + main() diff --git a/models/cv/detection/atss/igie/scripts/infer_atss_fp16_accuracy.sh b/models/cv/detection/atss/igie/scripts/infer_atss_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..8bb8ab47171fbd78b8ac2417bc02749f6303279a --- /dev/null +++ b/models/cv/detection/atss/igie/scripts/infer_atss_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="atss_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path atss_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine atss_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/detection/atss/igie/scripts/infer_atss_fp16_performance.sh b/models/cv/detection/atss/igie/scripts/infer_atss_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..427c05be1cdbe3513d06f9b338097f27c4883b9d --- /dev/null +++ b/models/cv/detection/atss/igie/scripts/infer_atss_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
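+
+# Builds a batch-32 FP16 ATSS engine (800x800 input) and runs inference.py in
+# performance-only mode; DATASETS_DIR is passed through, but only timing is
+# reported.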
+ +batchsize=32 +model_path="atss_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path atss_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine atss_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/detection/fcos/igie/README.md b/models/cv/detection/fcos/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2aa984220574c4ad554fd867422e6047bcb23ca0 --- /dev/null +++ b/models/cv/detection/fcos/igie/README.md @@ -0,0 +1,65 @@ +# FCOS + +## Description + +FCOS is an innovative one-stage object detection framework that abandons traditional anchor box dependency and uses a fully convolutional network for per-pixel target prediction. By introducing a centerness branch and multi-scale feature fusion, FCOS enhances detection performance while simplifying the model structure, especially in detecting small and overlapping targets. Additionally, FCOS eliminates the need for hyperparameter tuning related to anchor boxes, streamlining the model training and tuning process. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmdet==3.3.0 +pip3 install mmdeploy==1.3.1 +pip3 install mmengine==0.10.4 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +```bash +wget https://download.openmmlab.com/mmdetection/v2.0/fcos/fcos_r50_caffe_fpn_gn-head_1x_coco/fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight fcos_r50_caffe_fpn_gn-head_1x_coco-821213aa.pth --cfg fcos_r50_caffe_fpn_gn-head_1x_coco.py --output fcos.onnx + +# use onnxsim optimize onnx model +onnxsim fcos.onnx fcos_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/coco/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_fcos_fp16_accuracy.sh +# Performance +bash scripts/infer_fcos_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Input Shape | Precision | FPS | mAP@0.5(%) | +| :---: | :-------: | :---------: | :-------: | :-----: | :--------: | +| FCOS | 32 | 800x800 | FP16 | 135.019 | 0.522 | diff --git a/models/cv/detection/fcos/igie/build_engine.py b/models/cv/detection/fcos/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/detection/fcos/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/detection/fcos/igie/deploy_default.py b/models/cv/detection/fcos/igie/deploy_default.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d8e43dc829456f0c2e46a7acfc3128757f945d --- /dev/null +++ b/models/cv/detection/fcos/igie/deploy_default.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['output'], + input_shape=None, + optimize=True) + +codebase_config = dict( + type='mmdet', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + )) + +backend_config = dict(type='onnxruntime') \ No newline at end of file diff --git a/models/cv/detection/fcos/igie/export.py b/models/cv/detection/fcos/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..13573c9dff3d96be4ba59eaa8698d67fb1d50f13 --- /dev/null +++ b/models/cv/detection/fcos/igie/export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse + +import torch +from mmdeploy.utils import load_config +from mmdeploy.apis import build_task_processor + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + deploy_cfg = 'deploy_default.py' + model_cfg = args.cfg + model_checkpoint = args.weight + + deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + + task_processor = build_task_processor(model_cfg, deploy_cfg, device='cpu') + + model = task_processor.build_pytorch_model(model_checkpoint) + + input_names = ['input'] + dynamic_axes = {'input': {0: '-1'}} + dummy_input = torch.randn(1, 3, 800, 800) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/cv/detection/fcos/igie/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/detection/fcos/igie/fcos_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100755 index 0000000000000000000000000000000000000000..04941d978d5a49cf71df16acf4e0b7486c0ea56d --- /dev/null +++ b/models/cv/detection/fcos/igie/fcos_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,263 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
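+
+# MMDetection test configuration for FCOS with a caffe-style ResNet-50 + FPN.
+# Note the caffe preprocessing: BGR input (bgr_to_rgb=False), per-channel mean
+# subtraction with std=1.0, and an 800x800 resize with keep_ratio=True in the
+# test pipeline. inference.py overrides the dataset paths and batch size at
+# runtime.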
+ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=1, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +evaluation = dict(interval=1, metric='bbox') +load_from = None +log_level = 'ERROR' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +model = dict( + backbone=dict( + depth=50, + frozen_stages=1, + init_cfg=dict( + checkpoint='open-mmlab://detectron/resnet50_caffe', + type='Pretrained'), + norm_cfg=dict(requires_grad=False, type='BN'), + norm_eval=True, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + style='caffe', + type='ResNet'), + bbox_head=dict( + feat_channels=256, + in_channels=256, + loss_bbox=dict(loss_weight=1.0, type='IoULoss'), + loss_centerness=dict( + loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=True), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='FocalLoss', + use_sigmoid=True), + num_classes=80, + stacked_convs=4, + strides=[ + 8, + 16, + 32, + 64, + 128, + ], + type='FCOSHead'), + data_preprocessor=dict( + bgr_to_rgb=False, + mean=[ + 102.9801, + 115.9465, + 122.7717, + ], + pad_size_divisor=32, + std=[ + 1.0, + 1.0, + 1.0, + ], + type='DetDataPreprocessor'), + neck=dict( + add_extra_convs='on_output', + in_channels=[ + 256, + 512, + 1024, + 2048, + ], + num_outs=5, + out_channels=256, + relu_before_extra_convs=True, + start_level=1, + type='FPN'), + test_cfg=dict( + max_per_img=100, + min_bbox_size=0, + nms=dict(iou_threshold=0.5, type='nms'), + nms_pre=1000, + score_thr=0.05), + type='FCOS') +optim_wrapper = dict( + optimizer=dict(lr=0.02, momentum=0.9, type='SGD', weight_decay=0.0001), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=500, start_factor=0.001, type='LinearLR'), + dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 8, + 11, + ], + type='MultiStepLR'), +] +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='images/val2017/'), + data_root='/root/.igie_cache/modelzoo_data/datasets/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file= + '/root/.igie_cache/modelzoo_data/datasets/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', 
+ 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=1) +train_dataloader = dict( + batch_sampler=dict(type='AspectRatioBatchSampler'), + batch_size=2, + dataset=dict( + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' diff --git a/models/cv/detection/fcos/igie/inference.py b/models/cv/detection/fcos/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e0517124fcb3e4cf136adfff62f6dda9c2765064 --- /dev/null +++ b/models/cv/detection/fcos/igie/inference.py @@ -0,0 +1,160 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
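+
+# Overview (summary of the script below): load a compiled IGIE engine (.so),
+# optionally run a timing-only pass (--perf_only), otherwise build an MMEngine
+# runner from fcos_r50_caffe_fpn_gn-head_1x_coco.py, feed padded COCO val2017
+# batches to the engine, regroup the raw outputs by channel count
+# (80 -> class scores, 4 -> box regression, otherwise centerness), decode them
+# with bbox_head.predict_by_feat, and accumulate CocoMetric results.
+# Typical invocation (matching scripts/infer_fcos_fp16_accuracy.sh):
+#   python3 inference.py --engine fcos_bs_32_fp16.so --batchsize 32 \
+#       --input_name input --datasets ${DATASETS_DIR}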
+ +import os +import argparse +import tvm +import torch +import numpy as np +from tvm import relay +from tqdm import tqdm +from mmpose.registry import RUNNERS +from mmengine.config import Config + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # runner config + cfg = Config.fromfile("fcos_r50_caffe_fpn_gn-head_1x_coco.py") + + cfg.work_dir = "./" + cfg['test_dataloader']['batch_size'] = batch_size + cfg['test_dataloader']['dataset']['data_root'] = args.datasets + cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'images/val2017/' + cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/instances_val2017.json') + cfg['log_level'] = 'ERROR' + + # build runner + runner = RUNNERS.build(cfg) + + for data in tqdm(runner.test_dataloader): + cls_score = [] + box_reg = [] + score_factors = [] + + input_data = runner.model.data_preprocessor(data, False) + image = input_data['inputs'].cpu() + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + for i in range(module.get_num_outputs()): + output = module.get_output(i).asnumpy() + + if pad_batch: + output = output[:origin_size] + + output = torch.from_numpy(output) + + if output.shape[1] == 80: + cls_score.append(output) + elif output.shape[1] == 4: + box_reg.append(output) + else: + score_factors.append(output) + + batch_img_metas = [ + data_samples.metainfo for data_samples in data['data_samples'] + ] + + preds = runner.model.bbox_head.predict_by_feat( + cls_score, box_reg, score_factors=score_factors, batch_img_metas=batch_img_metas, rescale=True + ) + + batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], preds) + + runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=data) + + metrics = 
runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset)) + + +if __name__ == "__main__": + main() diff --git a/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_accuracy.sh b/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..c09f2c330d0b1880ae3453c2880cbd63bd07350b --- /dev/null +++ b/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="fcos_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path fcos_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine fcos_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_performance.sh b/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..8f358c38f286b2725a8cc1738451a85ecad11d3d --- /dev/null +++ b/models/cv/detection/fcos/igie/scripts/infer_fcos_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
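+
+# Flow of this script: compile the simplified ONNX model (fcos_opt.onnx) into
+# an FP16 IGIE engine for a fixed 32x3x800x800 input, then run inference.py in
+# --perf_only mode, which only reports mean latency (ms) and FPS; no accuracy
+# metric is computed here. DATASETS_DIR must point to the COCO dataset root.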
+ +batchsize=32 +model_path="fcos_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path fcos_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine fcos_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/README.md b/models/cv/detection/fsaf/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7a587dc0e10d1c382019a0b18c7bbe0574d26994 --- /dev/null +++ b/models/cv/detection/fsaf/igie/README.md @@ -0,0 +1,65 @@ +# FSAF + +## Description + +The FSAF (Feature Selective Anchor-Free) module is an innovative component for single-shot object detection that enhances performance through online feature selection and anchor-free branches. The FSAF module dynamically selects the most suitable feature level for each object instance, rather than relying on traditional anchor-based heuristic methods. This improvement significantly boosts the accuracy of object detection, especially for small targets and in complex scenes. Moreover, compared to existing anchor-based detectors, the FSAF module maintains high efficiency while adding negligible additional inference overhead. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmdet==3.3.0 +pip3 install mmdeploy==1.3.1 +pip3 install mmengine==0.10.4 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +```bash +wget https://download.openmmlab.com/mmdetection/v2.0/fsaf/fsaf_r50_fpn_1x_coco/fsaf_r50_fpn_1x_coco-94ccc51f.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight fsaf_r50_fpn_1x_coco-94ccc51f.pth --cfg fsaf_r50_fpn_1x_coco.py --output fsaf.onnx + +# use onnxsim optimize onnx model +onnxsim fsaf.onnx fsaf_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/coco/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_fsaf_fp16_accuracy.sh +# Performance +bash scripts/infer_fsaf_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Input Shape | Precision | FPS | mAP@0.5(%) | +| :---: | :-------: | :---------: | :-------: | :-----: | :--------: | +| FSAF | 32 | 800x800 | FP16 | 178.748 | 0.530 | diff --git a/models/cv/detection/fsaf/igie/base/retinanet_r50_fpn_1x_coco.py b/models/cv/detection/fsaf/igie/base/retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..425d3d32efbaede948e1ac55e27d65be4ab26283 --- /dev/null +++ b/models/cv/detection/fsaf/igie/base/retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. + +# model settings +model = dict( + type='RetinaNet', + data_preprocessor=dict( + type='DetDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type='RetinaHead', + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type='PseudoSampler'), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100)) \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/build_engine.py b/models/cv/detection/fsaf/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/detection/fsaf/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. 
+ """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/deploy_default.py b/models/cv/detection/fsaf/igie/deploy_default.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d8e43dc829456f0c2e46a7acfc3128757f945d --- /dev/null +++ b/models/cv/detection/fsaf/igie/deploy_default.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['output'], + input_shape=None, + optimize=True) + +codebase_config = dict( + type='mmdet', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + )) + +backend_config = dict(type='onnxruntime') \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/export.py b/models/cv/detection/fsaf/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..13573c9dff3d96be4ba59eaa8698d67fb1d50f13 --- /dev/null +++ b/models/cv/detection/fsaf/igie/export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
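+
+# Overview: build the FSAF PyTorch model through the mmdeploy task processor
+# (deploy_default.py plus the mmdet config passed via --cfg), then export it
+# to ONNX with a dynamic batch dimension, a 3x800x800 input, and opset 13.
+# Typical invocation (from the README):
+#   python3 export.py --weight fsaf_r50_fpn_1x_coco-94ccc51f.pth \
+#       --cfg fsaf_r50_fpn_1x_coco.py --output fsaf.onnx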
+ +import argparse + +import torch +from mmdeploy.utils import load_config +from mmdeploy.apis import build_task_processor + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + deploy_cfg = 'deploy_default.py' + model_cfg = args.cfg + model_checkpoint = args.weight + + deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + + task_processor = build_task_processor(model_cfg, deploy_cfg, device='cpu') + + model = task_processor.build_pytorch_model(model_checkpoint) + + input_names = ['input'] + dynamic_axes = {'input': {0: '-1'}} + dummy_input = torch.randn(1, 3, 800, 800) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/fsaf_r50_fpn_1x_coco.py b/models/cv/detection/fsaf/igie/fsaf_r50_fpn_1x_coco.py new file mode 100755 index 0000000000000000000000000000000000000000..33c2df60e5472e22e57616614cad5ff4fb21e984 --- /dev/null +++ b/models/cv/detection/fsaf/igie/fsaf_r50_fpn_1x_coco.py @@ -0,0 +1,278 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
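+
+# MMDetection FSAF (ResNet-50 + FPN, 1x schedule) COCO config, with the test
+# dataloader adapted for IGIE evaluation: batch_size=32 and a fixed 800x800
+# resize (keep_ratio=False). The hard-coded data_root / ann_file paths below
+# are only defaults; inference.py overrides them at runtime from --datasets.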
+ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=1, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +load_from = None +log_level = 'ERROR' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +model = dict( + backbone=dict( + depth=50, + frozen_stages=1, + init_cfg=dict(checkpoint='torchvision://resnet50', type='Pretrained'), + norm_cfg=dict(requires_grad=True, type='BN'), + norm_eval=True, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + style='pytorch', + type='ResNet'), + bbox_head=dict( + anchor_generator=dict( + octave_base_scale=1, + ratios=[ + 1.0, + ], + scales_per_octave=1, + strides=[ + 8, + 16, + 32, + 64, + 128, + ], + type='AnchorGenerator'), + bbox_coder=dict(normalizer=4.0, type='TBLRBBoxCoder'), + feat_channels=256, + in_channels=256, + loss_bbox=dict( + eps=1e-06, loss_weight=1.0, reduction='none', type='IoULoss'), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + reduction='none', + type='FocalLoss', + use_sigmoid=True), + num_classes=80, + reg_decoded_bbox=True, + stacked_convs=4, + type='FSAFHead'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_size_divisor=32, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='DetDataPreprocessor'), + neck=dict( + add_extra_convs='on_input', + in_channels=[ + 256, + 512, + 1024, + 2048, + ], + num_outs=5, + out_channels=256, + start_level=1, + type='FPN'), + test_cfg=dict( + max_per_img=100, + min_bbox_size=0, + nms=dict(iou_threshold=0.5, type='nms'), + nms_pre=1000, + score_thr=0.05), + train_cfg=dict( + allowed_border=-1, + assigner=dict( + min_pos_iof=0.01, + neg_scale=0.2, + pos_scale=0.2, + type='CenterRegionAssigner'), + debug=False, + pos_weight=-1, + sampler=dict(type='PseudoSampler')), + type='FSAF') +optim_wrapper = dict( + optimizer=dict(lr=0.02, momentum=0.9, type='SGD', weight_decay=0.0001), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=500, start_factor=0.001, type='LinearLR'), + dict( + begin=0, + by_epoch=True, + end=12, + gamma=0.1, + milestones=[ + 8, + 11, + ], + type='MultiStepLR'), +] +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='images/val2017/'), + data_root='/root/.igie_cache/modelzoo_data/datasets/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 800, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file= + '/root/.igie_cache/modelzoo_data/datasets/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + 
metric='bbox', + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=1) +train_dataloader = dict( + batch_sampler=dict(type='AspectRatioBatchSampler'), + batch_size=2, + dataset=dict( + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PackDetInputs'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 1333, + 800, + ), type='Resize'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' diff --git a/models/cv/detection/fsaf/igie/inference.py b/models/cv/detection/fsaf/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..058a53433e2ac0e02f3650846980c2d5460d5e90 --- /dev/null +++ b/models/cv/detection/fsaf/igie/inference.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
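+
+# Overview (summary of the script below): load the compiled IGIE engine,
+# optionally run a timing-only pass (--perf_only), otherwise build an MMEngine
+# runner from fsaf_r50_fpn_1x_coco.py, push padded COCO val2017 batches
+# through the engine, split the outputs by channel count (80 -> class scores,
+# 4 -> box regression), decode them with bbox_head.predict_by_feat, and report
+# CocoMetric results.
+# Typical invocation (matching scripts/infer_fsaf_fp16_accuracy.sh):
+#   python3 inference.py --engine fsaf_bs_32_fp16.so --batchsize 32 \
+#       --input_name input --datasets ${DATASETS_DIR}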
+ +import os +import argparse +import tvm +import torch +import numpy as np +from tvm import relay +from tqdm import tqdm +from mmpose.registry import RUNNERS +from mmengine.config import Config + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # runner config + cfg = Config.fromfile("fsaf_r50_fpn_1x_coco.py") + + cfg.work_dir = "./" + cfg['test_dataloader']['batch_size'] = batch_size + cfg['test_dataloader']['dataset']['data_root'] = args.datasets + cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'images/val2017/' + cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/instances_val2017.json') + cfg['log_level'] = 'ERROR' + + # build runner + runner = RUNNERS.build(cfg) + + for data in tqdm(runner.test_dataloader): + cls_score = [] + box_reg = [] + + input_data = runner.model.data_preprocessor(data, False) + image = input_data['inputs'].cpu() + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + for i in range(module.get_num_outputs()): + output = module.get_output(i).asnumpy() + + if pad_batch: + output = output[:origin_size] + + output = torch.from_numpy(output) + + if output.shape[1] == 80: + cls_score.append(output) + elif output.shape[1] == 4: + box_reg.append(output) + + batch_img_metas = [ + data_samples.metainfo for data_samples in data['data_samples'] + ] + + preds = runner.model.bbox_head.predict_by_feat( + cls_score, box_reg, batch_img_metas=batch_img_metas, rescale=True + ) + + batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], preds) + + runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=data) + + metrics = runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset)) + + +if __name__ == "__main__": + 
main() diff --git a/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_accuracy.sh b/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..94bd0b406f0fe03ab33a040e544771da5322bb89 --- /dev/null +++ b/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="fsaf_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path fsaf_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine fsaf_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_performance.sh b/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..cb8f93aa049143ebdef57db7a8b17a310e8b91d4 --- /dev/null +++ b/models/cv/detection/fsaf/igie/scripts/infer_fsaf_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="fsaf_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,800,800 \ + --precision fp16 \ + --engine_path fsaf_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine fsaf_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/detection/retinaface/igie/README.md b/models/cv/detection/retinaface/igie/README.md new file mode 100755 index 0000000000000000000000000000000000000000..a4eb4e4d406b3b225c4db9f4980a9f35b757aa3d --- /dev/null +++ b/models/cv/detection/retinaface/igie/README.md @@ -0,0 +1,67 @@ +# RetinaFace + +## Description + +RetinaFace is an efficient single-stage face detection model that employs a multi-task learning strategy to simultaneously predict facial locations, landmarks, and 3D facial shapes. 
It utilizes feature pyramids and context modules to extract multi-scale features and employs a self-supervised mesh decoder to enhance detection accuracy. RetinaFace demonstrates excellent performance on datasets like WIDER FACE, supports real-time processing, and its code and datasets are publicly available for researchers. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install opencv-python==4.6.0.66 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +```bash +wget https://github.com/biubug6/Face-Detector-1MB-with-landmark/raw/master/weights/mobilenet0.25_Final.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight mobilenet0.25_Final.pth --output retinaface.onnx + +# use onnxsim optimize onnx model +onnxsim retinaface.onnx retinaface_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/widerface/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_retinaface_fp16_accuracy.sh +# Performance +bash scripts/infer_retinaface_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Precision | FPS | Easy AP(%) | Medium AP (%) | Hard AP(%) | +| :--------: | :-------: | :-------: | :------: | :--------: | :-----------: | :--------: | +| RetinaFace | 32 | FP16 | 8304.626 | 80.13 | 68.52 | 36.59 | + +## Reference + +Face-Detector-1MB-with-landmark: diff --git a/models/cv/detection/retinaface/igie/build_engine.py b/models/cv/detection/retinaface/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/detection/retinaface/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. 
+ """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/detection/retinaface/igie/export.py b/models/cv/detection/retinaface/igie/export.py new file mode 100755 index 0000000000000000000000000000000000000000..a1816645c2cf6009d2f4529bdd340832db64a044 --- /dev/null +++ b/models/cv/detection/retinaface/igie/export.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse +import torch +from models.retinaface import RetinaFace + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def check_keys(model, pretrained_state_dict): + ckpt_keys = set(pretrained_state_dict.keys()) + model_keys = set(model.state_dict().keys()) + used_pretrained_keys = model_keys & ckpt_keys + unused_pretrained_keys = ckpt_keys - model_keys + missing_keys = model_keys - ckpt_keys + print('Missing keys:{}'.format(len(missing_keys))) + print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) + print('Used keys:{}'.format(len(used_pretrained_keys))) + assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' + return True + + +def remove_prefix(state_dict, prefix): + ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
''' + print('remove prefix \'{}\''.format(prefix)) + f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x + return {f(key): value for key, value in state_dict.items()} + + +def load_model(model, pretrained_path): + print('Loading pretrained model from {}'.format(pretrained_path)) + pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) + + if "state_dict" in pretrained_dict.keys(): + pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') + else: + pretrained_dict = remove_prefix(pretrained_dict, 'module.') + check_keys(model, pretrained_dict) + model.load_state_dict(pretrained_dict, strict=False) + return model + +def main(): + args = parse_args() + + cfg_mnet = { + 'name': 'mobilenet0.25', + 'min_sizes': [[10, 20], [32, 64], [128, 256]], + 'steps': [8, 16, 32], + 'variance': [0.1, 0.2], + 'clip': False, + 'loc_weight': 2.0, + 'gpu_train': True, + 'batch_size': 32, + 'ngpu': 1, + 'epoch': 250, + 'decay1': 190, + 'decay2': 220, + 'image_size': 300, + 'pretrain': False, + 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, + 'in_channel': 32, + 'out_channel': 64 + } + model = RetinaFace(cfg = cfg_mnet, phase = 'test') + + # load weight + model = load_model(model, args.weight) + model.eval() + + input_names = ["input"] + output_names = ["output"] + dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}} + dummy_input = torch.randn(1, 3, 320, 320) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + output_names = output_names, + opset_version=13 + ) + print("Export onnx model successfully! ") + + +if __name__ == '__main__': + main() + + diff --git a/models/cv/detection/retinaface/igie/inference.py b/models/cv/detection/retinaface/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..44d29d1b9e1c8075c1212cecbe13ed683ebf1d7f --- /dev/null +++ b/models/cv/detection/retinaface/igie/inference.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
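+
+# Overview (summary of the script below): read the WIDER FACE validation list
+# (val/wider_val.txt), letterbox each image to 320x320 and subtract the BGR
+# channel means (104, 117, 123), run the IGIE engine, then post-process the
+# three outputs (box regression, class confidence, landmarks) into detection
+# txt files under ./widerface_evaluate/widerface_txt/ for the evaluation
+# script shipped in widerface_evaluate/. --perf_only skips accuracy and
+# reports latency/FPS only.
+# Typical invocation (matching scripts/infer_retinaface_fp16_accuracy.sh):
+#   python3 inference.py --engine retinaface_bs_32_fp16.so --batchsize 32 \
+#       --input_name input --datasets ${DATASETS_DIR}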
+ +import os +import cv2 +import tvm +import torch +import argparse +import numpy as np +from tvm import relay +from tqdm import tqdm +from torch.utils.data import Dataset +from utils.post_process import post_process + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114)): + shape = im.shape[:2] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + r = 1 + + # Compute padding + ratio = r, r + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] + dw /= 2 + dh /= 2 + + if shape[::-1] != new_unpad: + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, + top, + bottom, + left, + right, + cv2.BORDER_CONSTANT, + value=color) + return im, ratio, (dw, dh) + + +class FaceDataset(Dataset): + def __init__(self, img_path,image_size=320, layout="NCHW"): + + self.imgs_path = [] + self.imgs_path_ori=[] + self.image_size=image_size + self.layout = layout + self.img_dir=os.path.dirname(img_path) + with open(img_path, 'r') as fr: + self.imgs_path = fr.read().split() + self.imgs_path_ori=self.imgs_path + + def __len__(self): + return len(self.imgs_path) + + def __getitem__(self, idx): + img, (h0, w0), (h, w) = self._load_image(idx) + img, ratio, pad = letterbox(img, + self.image_size, + color=(114,114,114)) + shapes = (h0, w0), ((h / h0, w / w0), pad),(h, w) + img = img.astype(np.float32) + img -= (104, 117, 123) + img = img.transpose(2, 0, 1) + + return img, self.imgs_path[idx], shapes, self.imgs_path_ori[idx] + + + @staticmethod + def collate_fn(batch): + im, path, shapes, path_ori = zip(*batch) + return np.concatenate([i[None] for i in im], axis=0), path, shapes, path_ori + + def _load_image(self, i): + im = cv2.imread(self.img_dir+'/images'+self.imgs_path[i], cv2.IMREAD_COLOR) + h0, w0 = im.shape[:2] + r = self.image_size / max(h0, w0) + if r != 1: + im = cv2.resize(im, (int(w0 * r), int(h0 * r)), + interpolation=cv2.INTER_LINEAR) + return im.astype("float32"), (h0, w0), im.shape[:2] + +def get_dataloader(args): + image_size = 320 + batchsize = args.batchsize + data_path = os.path.join(args.datasets, 'val/wider_val.txt') + datasets =FaceDataset(data_path, image_size) + dataLoader = torch.utils.data.DataLoader(datasets, batchsize, drop_last=False, collate_fn=datasets.collate_fn) + + return dataLoader + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = 
tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warmup + for _ in range(args.warmup): + module.run() + + dataloader = get_dataloader(args) + + for batch in tqdm(dataloader): + image = batch[0] + shapes = batch[2] + img_names = batch[3] + + pad_batch = len(image) != batch_size + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + loc_bs, conf_bs, landms_bs = module.get_output(0).asnumpy(), module.get_output(1).asnumpy(), module.get_output(2).asnumpy() + + if pad_batch: + loc_bs = loc_bs[:origin_size] + conf_bs = conf_bs[:origin_size] + landms_bs = landms_bs[:origin_size] + + ## batch accuracy + post_process(shapes, img_names, loc_bs, conf_bs, landms_bs, save_folder='./widerface_evaluate/widerface_txt/') + +if __name__ == "__main__": + main() diff --git a/models/cv/detection/retinaface/igie/models/net.py b/models/cv/detection/retinaface/igie/models/net.py new file mode 100644 index 0000000000000000000000000000000000000000..981be825f619ab3d655553d013f987f0e129a4f7 --- /dev/null +++ b/models/cv/detection/retinaface/igie/models/net.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
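+
+# Building blocks for the RetinaFace network assembled in models/retinaface.py:
+# plain and depthwise-separable conv+BN helpers, the SSH context module
+# (parallel 3x3 / 5x5 / 7x7 receptive-field branches concatenated and ReLU'd),
+# a three-level FPN with nearest-neighbour upsampling, and the reduced-width
+# MobileNetV1 (0.25x) backbone whose stage1/stage2/stage3 feature maps
+# (64/128/256 channels) feed the FPN.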
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +def conv_bn(inp, oup, stride = 1): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + +def conv_bn_no_relu(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ) + +def conv_bn1X1(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + +def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + +class SSH(nn.Module): + def __init__(self, in_channel, out_channel): + super(SSH, self).__init__() + assert out_channel % 4 == 0 + leaky = 0 + if (out_channel <= 64): + leaky = 0.1 + self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) + + self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1) + self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) + + self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1) + self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) + + def forward(self, input): + conv3X3 = self.conv3X3(input) + + conv5X5_1 = self.conv5X5_1(input) + conv5X5 = self.conv5X5_2(conv5X5_1) + + conv7X7_2 = self.conv7X7_2(conv5X5_1) + conv7X7 = self.conv7x7_3(conv7X7_2) + + out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) + out = F.relu(out) + return out + +class FPN(nn.Module): + def __init__(self,in_channels_list,out_channels): + super(FPN,self).__init__() + leaky = 0 + if (out_channels <= 64): + leaky = 0.1 + self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride = 1) + self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride = 1) + self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride = 1) + + self.merge1 = conv_bn(out_channels, out_channels) + self.merge2 = conv_bn(out_channels, out_channels) + + def forward(self, input): + # names = list(input.keys()) + input = list(input.values()) + + output1 = self.output1(input[0]) + output2 = self.output2(input[1]) + output3 = self.output3(input[2]) + + up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") + output2 = output2 + up3 + output2 = self.merge2(output2) + + up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") + output1 = output1 + up2 + output1 = self.merge1(output1) + + out = [output1, output2, output3] + return out + + + +class MobileNetV1(nn.Module): + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 8, 2), # 3 + conv_dw(8, 16, 1), # 7 + conv_dw(16, 32, 2), # 11 + conv_dw(32, 32, 1), # 19 + conv_dw(32, 64, 2), # 27 + conv_dw(64, 64, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(64, 128, 2), # 43 + 16 = 59 + conv_dw(128, 128, 1), # 59 + 32 = 91 + conv_dw(128, 128, 1), # 91 + 32 = 123 + conv_dw(128, 128, 1), # 123 + 32 = 155 + conv_dw(128, 128, 1), # 155 + 32 = 187 + conv_dw(128, 128, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(128, 256, 2), # 219 +3 2 = 241 + conv_dw(256, 256, 1), # 241 + 64 = 301 + ) + self.avg = nn.AdaptiveAvgPool2d((1,1)) + self.fc = nn.Linear(256, 1000) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = 
self.avg(x) + # x = self.model(x) + x = x.view(-1, 256) + x = self.fc(x) + return x + diff --git a/models/cv/detection/retinaface/igie/models/retinaface.py b/models/cv/detection/retinaface/igie/models/retinaface.py new file mode 100644 index 0000000000000000000000000000000000000000..873506e35960e87ecc601eae8a610dee75e02213 --- /dev/null +++ b/models/cv/detection/retinaface/igie/models/retinaface.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import torch.nn as nn +import torchvision.models._utils as _utils +import torch.nn.functional as F + +from models.net import MobileNetV1 as MobileNetV1 +from models.net import FPN as FPN +from models.net import SSH as SSH + +class ClassHead(nn.Module): + def __init__(self,inchannels=512,num_anchors=3): + super(ClassHead,self).__init__() + self.num_anchors = num_anchors + self.conv1x1 = nn.Conv2d(inchannels,self.num_anchors*2,kernel_size=(1,1),stride=1,padding=0) + + def forward(self,x): + out = self.conv1x1(x) + out = out.permute(0,2,3,1).contiguous() + + return out.view(out.shape[0], -1, 2) + +class BboxHead(nn.Module): + def __init__(self,inchannels=512,num_anchors=3): + super(BboxHead,self).__init__() + self.conv1x1 = nn.Conv2d(inchannels,num_anchors*4,kernel_size=(1,1),stride=1,padding=0) + + def forward(self,x): + out = self.conv1x1(x) + out = out.permute(0,2,3,1).contiguous() + + return out.view(out.shape[0], -1, 4) + +class LandmarkHead(nn.Module): + def __init__(self,inchannels=512,num_anchors=3): + super(LandmarkHead,self).__init__() + self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10,kernel_size=(1,1),stride=1,padding=0) + + def forward(self,x): + out = self.conv1x1(x) + out = out.permute(0,2,3,1).contiguous() + + return out.view(out.shape[0], -1, 10) + +class RetinaFace(nn.Module): + def __init__(self, cfg = None, phase = 'train'): + """ + :param cfg: Network related settings. + :param phase: train or test. + """ + super(RetinaFace,self).__init__() + self.phase = phase + backbone = None + if cfg['name'] == 'mobilenet0.25': + backbone = MobileNetV1() + if cfg['pretrain']: + checkpoint = torch.load("./weights/mobilenetV1X0.25_pretrain.tar", map_location=torch.device('cpu')) + from collections import OrderedDict + new_state_dict = OrderedDict() + for k, v in checkpoint['state_dict'].items(): + name = k[7:] # remove module. 
+ new_state_dict[name] = v + # load params + backbone.load_state_dict(new_state_dict) + elif cfg['name'] == 'Resnet50': + import torchvision.models as models + backbone = models.resnet50(pretrained=cfg['pretrain']) + + self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) + in_channels_stage2 = cfg['in_channel'] + in_channels_list = [ + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + out_channels = cfg['out_channel'] + self.fpn = FPN(in_channels_list,out_channels) + self.ssh1 = SSH(out_channels, out_channels) + self.ssh2 = SSH(out_channels, out_channels) + self.ssh3 = SSH(out_channels, out_channels) + + self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) + self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) + self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) + + def _make_class_head(self,fpn_num=3,inchannels=64,anchor_num=2): + classhead = nn.ModuleList() + for i in range(fpn_num): + classhead.append(ClassHead(inchannels,anchor_num)) + return classhead + + def _make_bbox_head(self,fpn_num=3,inchannels=64,anchor_num=2): + bboxhead = nn.ModuleList() + for i in range(fpn_num): + bboxhead.append(BboxHead(inchannels,anchor_num)) + return bboxhead + + def _make_landmark_head(self,fpn_num=3,inchannels=64,anchor_num=2): + landmarkhead = nn.ModuleList() + for i in range(fpn_num): + landmarkhead.append(LandmarkHead(inchannels,anchor_num)) + return landmarkhead + + def forward(self,inputs): + out = self.body(inputs) + + # FPN + fpn = self.fpn(out) + + # SSH + feature1 = self.ssh1(fpn[0]) + feature2 = self.ssh2(fpn[1]) + feature3 = self.ssh3(fpn[2]) + features = [feature1, feature2, feature3] + + bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) + classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)],dim=1) + ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) + + if self.phase == 'train': + output = (bbox_regressions, classifications, ldm_regressions) + else: + output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) + return output \ No newline at end of file diff --git a/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_accuracy.sh b/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..ea43f251a9f1214dbda82062ffd567f382ce2167 --- /dev/null +++ b/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_accuracy.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
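+
+# Flow of this script: compile retinaface_opt.onnx into an FP16 IGIE engine
+# for a fixed 32x3x320x320 input, run inference.py to dump per-image detection
+# txt files, then build the Cython extension inside widerface_evaluate/ and
+# run evaluation.py, which reports the Easy / Medium / Hard AP on WIDER FACE.
+# DATASETS_DIR must point to the WIDER FACE dataset root.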
+ +batchsize=32 +model_path="retinaface_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,320,320 \ + --precision fp16 \ + --engine_path retinaface_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine retinaface_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} + +# compute accuracy +cd widerface_evaluate +python3 setup.py build_ext --inplace +python3 evaluation.py +cd .. \ No newline at end of file diff --git a/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_performance.sh b/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..e92b1f56279f4607edb95ff8ae5e84072fd8bc62 --- /dev/null +++ b/models/cv/detection/retinaface/igie/scripts/infer_retinaface_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="retinaface_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,320,320 \ + --precision fp16 \ + --engine_path retinaface_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine retinaface_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/detection/retinaface/igie/utils/box_utils.py b/models/cv/detection/retinaface/igie/utils/box_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..0bf174cb44da423218d7ce548946d3af2e5cc729 --- /dev/null +++ b/models/cv/detection/retinaface/igie/utils/box_utils.py @@ -0,0 +1,344 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import torch +import numpy as np + +def point_form(boxes): + """ Convert prior_boxes to (xmin, ymin, xmax, ymax) + representation for comparison to point form ground truth data. + Args: + boxes: (tensor) center-size default boxes from priorbox layers. + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 
+ """ + return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax + + +def center_size(boxes): + """ Convert prior_boxes to (cx, cy, w, h) + representation for comparison to center-size form ground truth data. + Args: + boxes: (tensor) point_form boxes + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy + boxes[:, 2:] - boxes[:, :2], 1) # w, h + + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. + """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), + box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), + box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2]-box_b[:, 0]) * + (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i) + + +def matrix_iof(a, b): + """ + return iof of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + return area_i / np.maximum(area_a[:, np.newaxis], 1) + + +def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): + """Match each prior box with the ground truth box of the highest jaccard + overlap, encode the bounding boxes, then return the matched indices + corresponding to both confidence and location preds. + Args: + threshold: (float) The overlap threshold used when mathing boxes. + truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. + priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. + variances: (tensor) Variances corresponding to each prior coord, + Shape: [num_priors, 4]. + labels: (tensor) All the class labels for the image, Shape: [num_obj]. 
landms: (tensor) Ground truth landms, Shape [num_obj, 10].
+        loc_t: (tensor) Tensor to be filled w/ encoded location targets.
+        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
+        landm_t: (tensor) Tensor to be filled w/ encoded landm targets.
+        idx: (int) current batch index
+    Return:
+        The matched indices corresponding to 1)location 2)confidence 3)landm preds.
+    """
+    # jaccard index
+    overlaps = jaccard(
+        truths,
+        point_form(priors)
+    )
+    # (Bipartite Matching)
+    # [1,num_objects] best prior for each ground truth
+    best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
+
+    # ignore hard gt
+    valid_gt_idx = best_prior_overlap[:, 0] >= 0.2
+    best_prior_idx_filter = best_prior_idx[valid_gt_idx, :]
+    if best_prior_idx_filter.shape[0] <= 0:
+        loc_t[idx] = 0
+        conf_t[idx] = 0
+        return
+
+    # [1,num_priors] best ground truth for each prior
+    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
+    best_truth_idx.squeeze_(0)
+    best_truth_overlap.squeeze_(0)
+    best_prior_idx.squeeze_(1)
+    best_prior_idx_filter.squeeze_(1)
+    best_prior_overlap.squeeze_(1)
+    best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2)  # ensure best prior
+    # TODO refactor: index best_prior_idx with long tensor
+    # ensure every gt matches with its prior of max overlap
+    for j in range(best_prior_idx.size(0)):     # determine which ground truth box this anchor predicts
+        best_truth_idx[best_prior_idx[j]] = j
+    matches = truths[best_truth_idx]            # Shape: [num_priors,4] the ground truth bbox matched to each anchor
+    conf = labels[best_truth_idx]               # Shape: [num_priors] the ground truth label matched to each anchor
+    conf[best_truth_overlap < threshold] = 0    # label as background; anchors with overlap < 0.35 are all treated as negatives
+    loc = encode(matches, priors, variances)
+
+    matches_landm = landms[best_truth_idx]
+    landm = encode_landm(matches_landm, priors, variances)
+    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
+    conf_t[idx] = conf  # [num_priors] top class label for each prior
+    landm_t[idx] = landm
+
+
+def encode(matched, priors, variances):
+    """Encode the variances from the priorbox layers into the ground truth boxes
+    we have matched (based on jaccard overlap) with the prior boxes.
+    Args:
+        matched: (tensor) Coords of ground truth for each prior in point-form
+            Shape: [num_priors, 4].
+        priors: (tensor) Prior boxes in center-offset form
+            Shape: [num_priors,4].
+        variances: (list[float]) Variances of priorboxes
+    Return:
+        encoded boxes (tensor), Shape: [num_priors, 4]
+    """
+
+    # dist b/t match center and prior's center
+    g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
+    # encode variance
+    g_cxcy /= (variances[0] * priors[:, 2:])
+    # match wh / prior wh
+    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
+    g_wh = torch.log(g_wh) / variances[1]
+    # return target for smooth_l1_loss
+    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]
+
+def encode_landm(matched, priors, variances):
+    """Encode the variances from the priorbox layers into the ground truth boxes
+    we have matched (based on jaccard overlap) with the prior boxes.
+    Args:
+        matched: (tensor) Coords of ground truth for each prior in point-form
+            Shape: [num_priors, 10].
+        priors: (tensor) Prior boxes in center-offset form
+            Shape: [num_priors,4].
+ variances: (list[float]) Variances of priorboxes + Return: + encoded landm (tensor), Shape: [num_priors, 10] + """ + + # dist b/t match center and prior's center + matched = torch.reshape(matched, (matched.size(0), 5, 2)) + priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) + g_cxcy = matched[:, :, :2] - priors[:, :, :2] + # encode variance + g_cxcy /= (variances[0] * priors[:, :, 2:]) + # g_cxcy /= priors[:, :, 2:] + g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) + # return target for smooth_l1_loss + return g_cxcy + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], + ), dim=1) + return landms + + +def log_sum_exp(x): + """Utility function for computing log_sum_exp while determining + This will be used to determine unaveraged confidence loss across + all examples in a batch. + Args: + x (Variable(tensor)): conf_preds from conf layers + """ + x_max = x.data.max() + return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max + + +# Original author: Francisco Massa: +# https://github.com/fmassa/object-detection.torch +# Ported to PyTorch by Max deGroot (02/01/2017) +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. 
+ """ + + keep = torch.Tensor(scores.size(0)).fill_(0).long() + if boxes.numel() == 0: + return keep + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w*h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter/union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + diff --git a/models/cv/detection/retinaface/igie/utils/post_process.py b/models/cv/detection/retinaface/igie/utils/post_process.py new file mode 100755 index 0000000000000000000000000000000000000000..3affc02214efe8a15bfce61603eaf0d7a8f1a300 --- /dev/null +++ b/models/cv/detection/retinaface/igie/utils/post_process.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +import os +import torch +import numpy as np +from .prior_box import PriorBox +from .box_utils import decode, decode_landm +from .py_cpu_nms import py_cpu_nms + +cfg_mnet = { + 'name': 'mobilenet0.25', + 'min_sizes': [[10, 20], [32, 64], [128, 256]], + 'steps': [8, 16, 32], + 'variance': [0.1, 0.2], + 'clip': False, + 'loc_weight': 2.0, + 'gpu_train': True, + 'batch_size': 32, + 'ngpu': 1, + 'epoch': 250, + 'decay1': 190, + 'decay2': 220, + 'image_size': 300, + 'pretrain': False, + 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, + 'in_channel': 32, + 'out_channel': 64 +} + +def clip_boxes(boxes, shape): + # Clip boxes (xyxy) to image shape (height, width) + if isinstance(boxes, torch.Tensor): # faster individually + boxes[:, 0].clamp_(0, shape[1]) # x1 + boxes[:, 1].clamp_(0, shape[0]) # y1 + boxes[:, 2].clamp_(0, shape[1]) # x2 + boxes[:, 3].clamp_(0, shape[0]) # y2 + else: # np.array (faster grouped) + boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 + boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 + +def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None): + # Rescale boxes (xyxy) from img1_shape to img0_shape + if ratio_pad is None: # calculate from img0_shape + gain = min(img1_shape[0] / img0_shape[0], + img1_shape[1] / img0_shape[1]) # gain = old / new + pad = (img1_shape[1] - img0_shape[1] * gain) / 2, ( + img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding + else: + gain = ratio_pad[0][0] + pad = ratio_pad[1] + + boxes[:, [0, 2]] -= pad[0] # x padding + boxes[:, [1, 3]] -= pad[1] # y padding + boxes[:, :4] /= gain + clip_boxes(boxes, img0_shape) + return boxes + +def clip_boxes_landm(landm, shape): + # Clip boxes (xyxy) to image shape (height, width) + if isinstance(landm, torch.Tensor): # faster individually + landm[:, 0].clamp_(0, shape[1]) # x1 + landm[:, 1].clamp_(0, shape[0]) # y1 + landm[:, 2].clamp_(0, shape[1]) # x2 + landm[:, 3].clamp_(0, shape[0]) # y2 + landm[:, 4].clamp_(0, shape[1]) # x1 + landm[:, 5].clamp_(0, shape[0]) # y1 + landm[:, 6].clamp_(0, shape[1]) # x2 + landm[:, 7].clamp_(0, shape[0]) # y2 + landm[:, 8].clamp_(0, shape[1]) # x2 + landm[:, 9].clamp_(0, shape[0]) # y2 + else: # np.array (faster grouped) + landm[:, [0, 2,4,6,8]] = landm[:, [0, 2,4,6,8]].clip(0, shape[1]) # x1, x2 + landm[:, [1, 3,5,7,9]] = landm[:, [1, 3,5,7,9]].clip(0, shape[0]) # y1, y2 + +def scale_boxes_landm(img1_shape, landm, img0_shape, ratio_pad=None): + # Rescale boxes (xyxy) from img1_shape to img0_shape + if ratio_pad is None: # calculate from img0_shape + gain = min(img1_shape[0] / img0_shape[0], + img1_shape[1] / img0_shape[1]) # gain = old / new + pad = (img1_shape[1] - img0_shape[1] * gain) / 2, ( + img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding + else: + gain = ratio_pad[0][0] + pad = ratio_pad[1] + + landm[:, [0, 2,4,6,8]] -= pad[0] # x padding + landm[:, [1, 3,5,7,9]] -= pad[1] # y padding + landm[:, :10] /= gain + + clip_boxes_landm(landm, img0_shape) + return landm + +def post_process(shapes, img_names, loc_bs, conf_bs, landms_bs, save_folder): + max_size = 320 + confidence_threshold=0.02 + nms_threshold=0.4 + + for idx, loc in enumerate(loc_bs): + img_size=[320, 320] + im_shape=list(shapes[idx][0]) #ori + + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + resize = float(320) / float(im_size_min) + # prevent bigger axis from being more than max_size: + if np.round(resize * im_size_max) >max_size: + resize = float(max_size) / float(im_size_max) + + scale = torch.Tensor([img_size[1], 
img_size[0], img_size[1], img_size[0]]) + scale = scale.to('cpu') + + priorbox = PriorBox(cfg_mnet, image_size=(320, 320)) + priors = priorbox.forward() + priors = priors.to('cpu') + prior_data = priors.data + + boxes = decode(torch.from_numpy(loc_bs[idx]).data.squeeze(0).float(), prior_data, cfg_mnet['variance']) + boxes = boxes * scale + boxes=scale_boxes([320, 320],boxes,im_shape,shapes[idx][1]) + boxes = boxes.cpu().numpy() + scores = torch.from_numpy(conf_bs[idx]).squeeze(0).data.cpu().numpy()[:, 1] + landms = decode_landm(torch.from_numpy(landms_bs[idx]).data.squeeze(0), prior_data, cfg_mnet['variance']) + img_size=[1,3,img_size[0],img_size[1]] + + + scale1 = torch.Tensor([img_size[3], img_size[2], img_size[3], img_size[2], + img_size[3], img_size[2], img_size[3], img_size[2], + img_size[3], img_size[2]]) + scale1 = scale1.to('cpu') + + landms = landms * scale1 + landms=scale_boxes_landm([320, 320],landms,im_shape,shapes[idx][1]) + landms = landms.cpu().numpy() + + # ignore low scores + inds = np.where(scores > confidence_threshold)[0] + boxes = boxes[inds] + landms = landms[inds] + scores = scores[inds] + + # keep top-K before NMS + order = scores.argsort()[::-1] + boxes = boxes[order] + landms = landms[order] + scores = scores[order] + + # do NMS + dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) + keep = py_cpu_nms(dets, nms_threshold) + dets = dets[keep, :] + landms = landms[keep] + + dets = np.concatenate((dets, landms), axis=1) + + # -------------------------------------------------------------------- + save_name = save_folder + img_names[idx][:-4] + ".txt" + dirname = os.path.dirname(save_name) + if not os.path.isdir(dirname): + os.makedirs(dirname) + with open(save_name, "w") as fd: + bboxs = dets + file_name = os.path.basename(save_name)[:-4] + "\n" + bboxs_num = str(len(bboxs)) + "\n" + fd.write(file_name) + fd.write(bboxs_num) + for box in bboxs: + x = int(box[0]) + y = int(box[1]) + w = int(box[2]) - int(box[0]) + h = int(box[3]) - int(box[1]) + confidence = str(box[4]) + line = str(x) + " " + str(y) + " " + str(w) + " " + str(h) + " " + confidence + " \n" + fd.write(line) diff --git a/models/cv/detection/retinaface/igie/utils/prior_box.py b/models/cv/detection/retinaface/igie/utils/prior_box.py new file mode 100755 index 0000000000000000000000000000000000000000..da085d2215d05f519a0e0412bf0a700f3fabf366 --- /dev/null +++ b/models/cv/detection/retinaface/igie/utils/prior_box.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
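+
+# Summary of this module: PriorBox generates the anchor grid consumed by
+# post_process.py. For each stride in cfg['steps'] the feature map is
+# ceil(image_size/step) cells per side, and every cell gets one anchor per
+# entry in cfg['min_sizes'][k], stored as normalized (cx, cy, w, h). With the
+# cfg_mnet settings and a 320x320 input this gives 40x40, 20x20 and 10x10 grids
+# with two sizes each, i.e. (1600 + 400 + 100) * 2 = 4200 priors.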
+ +import torch +from itertools import product as product +import numpy as np +from math import ceil + + +class PriorBox(object): + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] + self.name = "s" + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] + dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output diff --git a/models/cv/detection/retinaface/igie/utils/py_cpu_nms.py b/models/cv/detection/retinaface/igie/utils/py_cpu_nms.py new file mode 100755 index 0000000000000000000000000000000000000000..161cd07bcf204d13a6e5f39c7042e708c73661f2 --- /dev/null +++ b/models/cv/detection/retinaface/igie/utils/py_cpu_nms.py @@ -0,0 +1,52 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
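+
+# Summary of this module: a pure-Python NMS baseline (adapted from Fast R-CNN).
+# Each row of dets is [x1, y1, x2, y2, score]; boxes are visited in descending
+# score order and any remaining box whose IoU with the current box exceeds
+# thresh is suppressed. Areas use the inclusive "+1" pixel convention of the
+# original implementation.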
+ +import numpy as np + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/README.md b/models/cv/detection/retinaface/igie/widerface_evaluate/README.md new file mode 100755 index 0000000000000000000000000000000000000000..95952b7b481e561ad6da3e7d562ce71b56f4b4a4 --- /dev/null +++ b/models/cv/detection/retinaface/igie/widerface_evaluate/README.md @@ -0,0 +1,27 @@ +# WiderFace-Evaluation +Python Evaluation Code for [Wider Face Dataset](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/) + + +## Usage + + +##### before evaluating .... + +```` +python3 setup.py build_ext --inplace +```` + +##### evaluating + +**GroungTruth:** `wider_face_val.mat`, `wider_easy_val.mat`, `wider_medium_val.mat`,`wider_hard_val.mat` + +```` +python3 evaluation.py -p -g +```` + +## Bugs & Problems +please issue + +## Acknowledgements + +some code borrowed from Sergey Karayev diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.c b/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.c new file mode 100755 index 0000000000000000000000000000000000000000..4926be24e3be99009998301b4d2f1490424b8133 --- /dev/null +++ b/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.c @@ -0,0 +1,6871 @@ +/* Generated by Cython 0.29.33 */ + +/* BEGIN: Cython Metadata +{ + "distutils": { + "depends": [ + "/home/work/zhaoxq/miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h", + "/home/work/zhaoxq/miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufuncobject.h" + ], + "include_dirs": [ + "/home/work/zhaoxq/miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/core/include" + ], + "name": "bbox", + "sources": [ + "box_overlaps.pyx" + ] + }, + "module_name": "bbox" +} +END: Cython Metadata */ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif /* PY_SSIZE_T_CLEAN */ +#include "Python.h" +#ifndef Py_PYTHON_H + #error Python headers needed to compile C extensions, please install development version of Python. +#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000) + #error Cython requires Python 2.6+ or Python 3.3+. 
+#else +#define CYTHON_ABI "0_29_33" +#define CYTHON_HEX_VERSION 0x001D21F0 +#define CYTHON_FUTURE_DIVISION 0 +#include +#ifndef offsetof + #define offsetof(type, member) ( (size_t) & ((type*)0) -> member ) +#endif +#if !defined(WIN32) && !defined(MS_WINDOWS) + #ifndef __stdcall + #define __stdcall + #endif + #ifndef __cdecl + #define __cdecl + #endif + #ifndef __fastcall + #define __fastcall + #endif +#endif +#ifndef DL_IMPORT + #define DL_IMPORT(t) t +#endif +#ifndef DL_EXPORT + #define DL_EXPORT(t) t +#endif +#define __PYX_COMMA , +#ifndef HAVE_LONG_LONG + #if PY_VERSION_HEX >= 0x02070000 + #define HAVE_LONG_LONG + #endif +#endif +#ifndef PY_LONG_LONG + #define PY_LONG_LONG LONG_LONG +#endif +#ifndef Py_HUGE_VAL + #define Py_HUGE_VAL HUGE_VAL +#endif +#ifdef PYPY_VERSION + #define CYTHON_COMPILING_IN_PYPY 1 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 0 + #undef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 0 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #if PY_VERSION_HEX < 0x03050000 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #undef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 0 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #undef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 1 + #undef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 0 + #undef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 0 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC 0 + #endif +#elif defined(PYSTON_VERSION) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 1 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #undef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 0 + #undef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 0 + #undef 
CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC 0 + #endif +#elif defined(PY_NOGIL) + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 0 + #define CYTHON_COMPILING_IN_NOGIL 1 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #ifndef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #undef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 0 + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #undef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL 0 + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT 1 + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define CYTHON_USE_TP_FINALIZE 1 + #endif + #undef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS 0 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 +#else + #define CYTHON_COMPILING_IN_PYPY 0 + #define CYTHON_COMPILING_IN_PYSTON 0 + #define CYTHON_COMPILING_IN_CPYTHON 1 + #define CYTHON_COMPILING_IN_NOGIL 0 + #ifndef CYTHON_USE_TYPE_SLOTS + #define CYTHON_USE_TYPE_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYTYPE_LOOKUP + #define CYTHON_USE_PYTYPE_LOOKUP 0 + #elif !defined(CYTHON_USE_PYTYPE_LOOKUP) + #define CYTHON_USE_PYTYPE_LOOKUP 1 + #endif + #if PY_MAJOR_VERSION < 3 + #undef CYTHON_USE_ASYNC_SLOTS + #define CYTHON_USE_ASYNC_SLOTS 0 + #elif !defined(CYTHON_USE_ASYNC_SLOTS) + #define CYTHON_USE_ASYNC_SLOTS 1 + #endif + #if PY_VERSION_HEX < 0x02070000 + #undef CYTHON_USE_PYLONG_INTERNALS + #define CYTHON_USE_PYLONG_INTERNALS 0 + #elif !defined(CYTHON_USE_PYLONG_INTERNALS) + #define CYTHON_USE_PYLONG_INTERNALS 1 + #endif + #ifndef CYTHON_USE_PYLIST_INTERNALS + #define CYTHON_USE_PYLIST_INTERNALS 1 + #endif + #ifndef CYTHON_USE_UNICODE_INTERNALS + #define CYTHON_USE_UNICODE_INTERNALS 1 + #endif + #if PY_VERSION_HEX < 0x030300F0 || PY_VERSION_HEX >= 0x030B00A2 + #undef CYTHON_USE_UNICODE_WRITER + #define CYTHON_USE_UNICODE_WRITER 0 + #elif !defined(CYTHON_USE_UNICODE_WRITER) + #define CYTHON_USE_UNICODE_WRITER 1 + #endif + #ifndef CYTHON_AVOID_BORROWED_REFS + #define CYTHON_AVOID_BORROWED_REFS 0 + #endif + #ifndef CYTHON_ASSUME_SAFE_MACROS + #define CYTHON_ASSUME_SAFE_MACROS 1 + #endif + #ifndef CYTHON_UNPACK_METHODS + #define CYTHON_UNPACK_METHODS 1 + #endif + #if PY_VERSION_HEX >= 0x030B00A4 + #undef CYTHON_FAST_THREAD_STATE + #define CYTHON_FAST_THREAD_STATE 0 + #elif !defined(CYTHON_FAST_THREAD_STATE) + #define CYTHON_FAST_THREAD_STATE 1 + #endif + #ifndef CYTHON_FAST_PYCALL + #define CYTHON_FAST_PYCALL (PY_VERSION_HEX < 0x030A0000) + #endif + #ifndef CYTHON_PEP489_MULTI_PHASE_INIT + #define CYTHON_PEP489_MULTI_PHASE_INIT (PY_VERSION_HEX >= 0x03050000) + #endif + #ifndef CYTHON_USE_TP_FINALIZE + #define 
CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1) + #endif + #ifndef CYTHON_USE_DICT_VERSIONS + #define CYTHON_USE_DICT_VERSIONS (PY_VERSION_HEX >= 0x030600B1) + #endif + #if PY_VERSION_HEX >= 0x030B00A4 + #undef CYTHON_USE_EXC_INFO_STACK + #define CYTHON_USE_EXC_INFO_STACK 0 + #elif !defined(CYTHON_USE_EXC_INFO_STACK) + #define CYTHON_USE_EXC_INFO_STACK (PY_VERSION_HEX >= 0x030700A3) + #endif + #ifndef CYTHON_UPDATE_DESCRIPTOR_DOC + #define CYTHON_UPDATE_DESCRIPTOR_DOC 1 + #endif +#endif +#if !defined(CYTHON_FAST_PYCCALL) +#define CYTHON_FAST_PYCCALL (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1) +#endif +#if CYTHON_USE_PYLONG_INTERNALS + #if PY_MAJOR_VERSION < 3 + #include "longintrepr.h" + #endif + #undef SHIFT + #undef BASE + #undef MASK + #ifdef SIZEOF_VOID_P + enum { __pyx_check_sizeof_voidp = 1 / (int)(SIZEOF_VOID_P == sizeof(void*)) }; + #endif +#endif +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#ifndef __has_cpp_attribute + #define __has_cpp_attribute(x) 0 +#endif +#ifndef CYTHON_RESTRICT + #if defined(__GNUC__) + #define CYTHON_RESTRICT __restrict__ + #elif defined(_MSC_VER) && _MSC_VER >= 1400 + #define CYTHON_RESTRICT __restrict + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_RESTRICT restrict + #else + #define CYTHON_RESTRICT + #endif +#endif +#ifndef CYTHON_UNUSED +# if defined(__GNUC__) +# if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER)) +# define CYTHON_UNUSED __attribute__ ((__unused__)) +# else +# define CYTHON_UNUSED +# endif +#endif +#ifndef CYTHON_MAYBE_UNUSED_VAR +# if defined(__cplusplus) + template void CYTHON_MAYBE_UNUSED_VAR( const T& ) { } +# else +# define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x) +# endif +#endif +#ifndef CYTHON_NCP_UNUSED +# if CYTHON_COMPILING_IN_CPYTHON +# define CYTHON_NCP_UNUSED +# else +# define CYTHON_NCP_UNUSED CYTHON_UNUSED +# endif +#endif +#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None) +#ifdef _MSC_VER + #ifndef _MSC_STDINT_H_ + #if _MSC_VER < 1300 + typedef unsigned char uint8_t; + typedef unsigned int uint32_t; + #else + typedef unsigned __int8 uint8_t; + typedef unsigned __int32 uint32_t; + #endif + #endif +#else + #include +#endif +#ifndef CYTHON_FALLTHROUGH + #if defined(__cplusplus) && __cplusplus >= 201103L + #if __has_cpp_attribute(fallthrough) + #define CYTHON_FALLTHROUGH [[fallthrough]] + #elif __has_cpp_attribute(clang::fallthrough) + #define CYTHON_FALLTHROUGH [[clang::fallthrough]] + #elif __has_cpp_attribute(gnu::fallthrough) + #define CYTHON_FALLTHROUGH [[gnu::fallthrough]] + #endif + #endif + #ifndef CYTHON_FALLTHROUGH + #if __has_attribute(fallthrough) + #define CYTHON_FALLTHROUGH __attribute__((fallthrough)) + #else + #define CYTHON_FALLTHROUGH + #endif + #endif + #if defined(__clang__ ) && defined(__apple_build_version__) + #if __apple_build_version__ < 7000000 + #undef CYTHON_FALLTHROUGH + #define CYTHON_FALLTHROUGH + #endif + #endif +#endif + +#ifndef CYTHON_INLINE + #if defined(__clang__) + #define CYTHON_INLINE __inline__ __attribute__ ((__unused__)) + #elif defined(__GNUC__) + #define CYTHON_INLINE __inline__ + #elif defined(_MSC_VER) + #define CYTHON_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define CYTHON_INLINE inline + #else + #define CYTHON_INLINE + #endif 
+#endif + +#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag) + #define Py_OptimizeFlag 0 +#endif +#define __PYX_BUILD_PY_SSIZE_T "n" +#define CYTHON_FORMAT_SSIZE_T "z" +#if PY_MAJOR_VERSION < 3 + #define __Pyx_BUILTIN_MODULE_NAME "__builtin__" + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) + #define __Pyx_DefaultClassType PyClass_Type +#else + #define __Pyx_BUILTIN_MODULE_NAME "builtins" + #define __Pyx_DefaultClassType PyType_Type +#if PY_VERSION_HEX >= 0x030B00A1 + static CYTHON_INLINE PyCodeObject* __Pyx_PyCode_New(int a, int k, int l, int s, int f, + PyObject *code, PyObject *c, PyObject* n, PyObject *v, + PyObject *fv, PyObject *cell, PyObject* fn, + PyObject *name, int fline, PyObject *lnos) { + PyObject *kwds=NULL, *argcount=NULL, *posonlyargcount=NULL, *kwonlyargcount=NULL; + PyObject *nlocals=NULL, *stacksize=NULL, *flags=NULL, *replace=NULL, *call_result=NULL, *empty=NULL; + const char *fn_cstr=NULL; + const char *name_cstr=NULL; + PyCodeObject* co=NULL; + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + if (!(kwds=PyDict_New())) goto end; + if (!(argcount=PyLong_FromLong(a))) goto end; + if (PyDict_SetItemString(kwds, "co_argcount", argcount) != 0) goto end; + if (!(posonlyargcount=PyLong_FromLong(0))) goto end; + if (PyDict_SetItemString(kwds, "co_posonlyargcount", posonlyargcount) != 0) goto end; + if (!(kwonlyargcount=PyLong_FromLong(k))) goto end; + if (PyDict_SetItemString(kwds, "co_kwonlyargcount", kwonlyargcount) != 0) goto end; + if (!(nlocals=PyLong_FromLong(l))) goto end; + if (PyDict_SetItemString(kwds, "co_nlocals", nlocals) != 0) goto end; + if (!(stacksize=PyLong_FromLong(s))) goto end; + if (PyDict_SetItemString(kwds, "co_stacksize", stacksize) != 0) goto end; + if (!(flags=PyLong_FromLong(f))) goto end; + if (PyDict_SetItemString(kwds, "co_flags", flags) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_code", code) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_consts", c) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_names", n) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_varnames", v) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_freevars", fv) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_cellvars", cell) != 0) goto end; + if (PyDict_SetItemString(kwds, "co_linetable", lnos) != 0) goto end; + if (!(fn_cstr=PyUnicode_AsUTF8AndSize(fn, NULL))) goto end; + if (!(name_cstr=PyUnicode_AsUTF8AndSize(name, NULL))) goto end; + if (!(co = PyCode_NewEmpty(fn_cstr, name_cstr, fline))) goto end; + if (!(replace = PyObject_GetAttrString((PyObject*)co, "replace"))) goto cleanup_code_too; + if (!(empty = PyTuple_New(0))) goto cleanup_code_too; // unfortunately __pyx_empty_tuple isn't available here + if (!(call_result = PyObject_Call(replace, empty, kwds))) goto cleanup_code_too; + Py_XDECREF((PyObject*)co); + co = (PyCodeObject*)call_result; + call_result = NULL; + if (0) { + cleanup_code_too: + Py_XDECREF((PyObject*)co); + co = NULL; + } + end: + Py_XDECREF(kwds); + Py_XDECREF(argcount); + Py_XDECREF(posonlyargcount); + Py_XDECREF(kwonlyargcount); + Py_XDECREF(nlocals); + Py_XDECREF(stacksize); + Py_XDECREF(replace); + Py_XDECREF(call_result); + Py_XDECREF(empty); + if (type) { + PyErr_Restore(type, value, traceback); + } + return co; + } +#else + #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\ + PyCode_New(a, 
k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) +#endif + #define __Pyx_DefaultClassType PyType_Type +#endif +#ifndef Py_TPFLAGS_CHECKTYPES + #define Py_TPFLAGS_CHECKTYPES 0 +#endif +#ifndef Py_TPFLAGS_HAVE_INDEX + #define Py_TPFLAGS_HAVE_INDEX 0 +#endif +#ifndef Py_TPFLAGS_HAVE_NEWBUFFER + #define Py_TPFLAGS_HAVE_NEWBUFFER 0 +#endif +#ifndef Py_TPFLAGS_HAVE_FINALIZE + #define Py_TPFLAGS_HAVE_FINALIZE 0 +#endif +#ifndef METH_STACKLESS + #define METH_STACKLESS 0 +#endif +#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL) + #ifndef METH_FASTCALL + #define METH_FASTCALL 0x80 + #endif + typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs); + typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args, + Py_ssize_t nargs, PyObject *kwnames); +#else + #define __Pyx_PyCFunctionFast _PyCFunctionFast + #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords +#endif +#if CYTHON_FAST_PYCCALL +#define __Pyx_PyFastCFunction_Check(func)\ + ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS | METH_STACKLESS))))) +#else +#define __Pyx_PyFastCFunction_Check(func) 0 +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc) + #define PyObject_Malloc(s) PyMem_Malloc(s) + #define PyObject_Free(p) PyMem_Free(p) + #define PyObject_Realloc(p) PyMem_Realloc(p) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX < 0x030400A1 + #define PyMem_RawMalloc(n) PyMem_Malloc(n) + #define PyMem_RawRealloc(p, n) PyMem_Realloc(p, n) + #define PyMem_RawFree(p) PyMem_Free(p) +#endif +#if CYTHON_COMPILING_IN_PYSTON + #define __Pyx_PyCode_HasFreeVars(co) PyCode_HasFreeVars(co) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno) +#else + #define __Pyx_PyCode_HasFreeVars(co) (PyCode_GetNumFree(co) > 0) + #define __Pyx_PyFrame_SetLineNumber(frame, lineno) (frame)->f_lineno = (lineno) +#endif +#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#elif PY_VERSION_HEX >= 0x03060000 + #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet() +#elif PY_VERSION_HEX >= 0x03000000 + #define __Pyx_PyThreadState_Current PyThreadState_GET() +#else + #define __Pyx_PyThreadState_Current _PyThreadState_Current +#endif +#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT) +#include "pythread.h" +#define Py_tss_NEEDS_INIT 0 +typedef int Py_tss_t; +static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) { + *key = PyThread_create_key(); + return 0; +} +static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) { + Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t)); + *key = Py_tss_NEEDS_INIT; + return key; +} +static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) { + PyObject_Free(key); +} +static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) { + return *key != Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) { + PyThread_delete_key(*key); + *key = Py_tss_NEEDS_INIT; +} +static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) { + return PyThread_set_key_value(*key, value); +} +static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) { + return PyThread_get_key_value(*key); +} +#endif +#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized) +#define __Pyx_PyDict_NewPresized(n) ((n <= 8) ? 
PyDict_New() : _PyDict_NewPresized(n)) +#else +#define __Pyx_PyDict_NewPresized(n) PyDict_New() +#endif +#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION + #define __Pyx_PyNumber_Divide(x,y) PyNumber_TrueDivide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceTrueDivide(x,y) +#else + #define __Pyx_PyNumber_Divide(x,y) PyNumber_Divide(x,y) + #define __Pyx_PyNumber_InPlaceDivide(x,y) PyNumber_InPlaceDivide(x,y) +#endif +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS +#define __Pyx_PyDict_GetItemStr(dict, name) _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash) +#else +#define __Pyx_PyDict_GetItemStr(dict, name) PyDict_GetItem(dict, name) +#endif +#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND) + #define CYTHON_PEP393_ENABLED 1 + #if PY_VERSION_HEX >= 0x030C0000 + #define __Pyx_PyUnicode_READY(op) (0) + #else + #define __Pyx_PyUnicode_READY(op) (likely(PyUnicode_IS_READY(op)) ?\ + 0 : _PyUnicode_Ready((PyObject *)(op))) + #endif + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_LENGTH(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) PyUnicode_MAX_CHAR_VALUE(u) + #define __Pyx_PyUnicode_KIND(u) PyUnicode_KIND(u) + #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) + #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) + #if PY_VERSION_HEX >= 0x030C0000 + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u)) + #else + #if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x03090000 + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : ((PyCompactUnicodeObject *)(u))->wstr_length)) + #else + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) + #endif + #endif +#else + #define CYTHON_PEP393_ENABLED 0 + #define PyUnicode_1BYTE_KIND 1 + #define PyUnicode_2BYTE_KIND 2 + #define PyUnicode_4BYTE_KIND 4 + #define __Pyx_PyUnicode_READY(op) (0) + #define __Pyx_PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u) + #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i])) + #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u) ((sizeof(Py_UNICODE) == 2) ? 
65535 : 1114111) + #define __Pyx_PyUnicode_KIND(u) (sizeof(Py_UNICODE)) + #define __Pyx_PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u)) + #define __Pyx_PyUnicode_READ(k, d, i) ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i])) + #define __Pyx_PyUnicode_WRITE(k, d, i, ch) (((void)(k)), ((Py_UNICODE*)d)[i] = ch) + #define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_SIZE(u)) +#endif +#if CYTHON_COMPILING_IN_PYPY + #define __Pyx_PyUnicode_Concat(a, b) PyNumber_Add(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) PyNumber_Add(a, b) +#else + #define __Pyx_PyUnicode_Concat(a, b) PyUnicode_Concat(a, b) + #define __Pyx_PyUnicode_ConcatSafe(a, b) ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\ + PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b)) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains) + #define PyUnicode_Contains(u, s) PySequence_Contains(u, s) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check) + #define PyByteArray_Check(obj) PyObject_TypeCheck(obj, &PyByteArray_Type) +#endif +#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format) + #define PyObject_Format(obj, fmt) PyObject_CallMethod(obj, "__format__", "O", fmt) +#endif +#define __Pyx_PyString_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyString_Check(b) && !PyString_CheckExact(b)))) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b)) +#define __Pyx_PyUnicode_FormatSafe(a, b) ((unlikely((a) == Py_None || (PyUnicode_Check(b) && !PyUnicode_CheckExact(b)))) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b)) +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyString_Format(a, b) PyUnicode_Format(a, b) +#else + #define __Pyx_PyString_Format(a, b) PyString_Format(a, b) +#endif +#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII) + #define PyObject_ASCII(o) PyObject_Repr(o) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBaseString_Type PyUnicode_Type + #define PyStringObject PyUnicodeObject + #define PyString_Type PyUnicode_Type + #define PyString_Check PyUnicode_Check + #define PyString_CheckExact PyUnicode_CheckExact +#ifndef PyObject_Unicode + #define PyObject_Unicode PyObject_Str +#endif +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj) + #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj) +#else + #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj)) + #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj)) +#endif +#ifndef PySet_CheckExact + #define PySet_CheckExact(obj) (Py_TYPE(obj) == &PySet_Type) +#endif +#if PY_VERSION_HEX >= 0x030900A4 + #define __Pyx_SET_REFCNT(obj, refcnt) Py_SET_REFCNT(obj, refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SET_SIZE(obj, size) +#else + #define __Pyx_SET_REFCNT(obj, refcnt) Py_REFCNT(obj) = (refcnt) + #define __Pyx_SET_SIZE(obj, size) Py_SIZE(obj) = (size) +#endif +#if CYTHON_ASSUME_SAFE_MACROS + #define __Pyx_PySequence_SIZE(seq) Py_SIZE(seq) +#else + #define __Pyx_PySequence_SIZE(seq) PySequence_Size(seq) +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define 
PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask + #define PyNumber_Int PyNumber_Long +#endif +#if PY_MAJOR_VERSION >= 3 + #define PyBoolObject PyLongObject +#endif +#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY + #ifndef PyUnicode_InternFromString + #define PyUnicode_InternFromString(s) PyUnicode_FromString(s) + #endif +#endif +#if PY_VERSION_HEX < 0x030200A4 + typedef long Py_hash_t; + #define __Pyx_PyInt_FromHash_t PyInt_FromLong + #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsHash_t +#else + #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t + #define __Pyx_PyInt_AsHash_t __Pyx_PyIndex_AsSsize_t +#endif +#if PY_MAJOR_VERSION >= 3 + #define __Pyx_PyMethod_New(func, self, klass) ((self) ? ((void)(klass), PyMethod_New(func, self)) : __Pyx_NewRef(func)) +#else + #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass) +#endif +#if CYTHON_USE_ASYNC_SLOTS + #if PY_VERSION_HEX >= 0x030500B1 + #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods + #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async) + #else + #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved)) + #endif +#else + #define __Pyx_PyType_AsAsync(obj) NULL +#endif +#ifndef __Pyx_PyAsyncMethodsStruct + typedef struct { + unaryfunc am_await; + unaryfunc am_aiter; + unaryfunc am_anext; + } __Pyx_PyAsyncMethodsStruct; +#endif + +#if defined(_WIN32) || defined(WIN32) || defined(MS_WINDOWS) + #if !defined(_USE_MATH_DEFINES) + #define _USE_MATH_DEFINES + #endif +#endif +#include +#ifdef NAN +#define __PYX_NAN() ((float) NAN) +#else +static CYTHON_INLINE float __PYX_NAN() { + float value; + memset(&value, 0xFF, sizeof(value)); + return value; +} +#endif +#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL) +#define __Pyx_truncl trunc +#else +#define __Pyx_truncl truncl +#endif + +#define __PYX_MARK_ERR_POS(f_index, lineno) \ + { __pyx_filename = __pyx_f[f_index]; (void)__pyx_filename; __pyx_lineno = lineno; (void)__pyx_lineno; __pyx_clineno = __LINE__; (void)__pyx_clineno; } +#define __PYX_ERR(f_index, lineno, Ln_error) \ + { __PYX_MARK_ERR_POS(f_index, lineno) goto Ln_error; } + +#ifndef __PYX_EXTERN_C + #ifdef __cplusplus + #define __PYX_EXTERN_C extern "C" + #else + #define __PYX_EXTERN_C extern + #endif +#endif + +#define __PYX_HAVE__bbox +#define __PYX_HAVE_API__bbox +/* Early includes */ +#include +#include +#include "numpy/arrayobject.h" +#include "numpy/ufuncobject.h" + + /* NumPy API declarations from "numpy/__init__.pxd" */ + +#ifdef _OPENMP +#include +#endif /* _OPENMP */ + +#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS) +#define CYTHON_WITHOUT_ASSERTIONS +#endif + +typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; + const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; + +#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_UTF8 0 +#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT (PY_MAJOR_VERSION >= 3 && __PYX_DEFAULT_STRING_ENCODING_IS_UTF8) +#define __PYX_DEFAULT_STRING_ENCODING "" +#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString +#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#define __Pyx_uchar_cast(c) ((unsigned char)c) +#define __Pyx_long_cast(x) ((long)x) +#define __Pyx_fits_Py_ssize_t(v, type, is_signed) (\ + (sizeof(type) < sizeof(Py_ssize_t)) ||\ + (sizeof(type) > 
sizeof(Py_ssize_t) &&\ + likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX) &&\ + (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\ + v == (type)PY_SSIZE_T_MIN))) ||\ + (sizeof(type) == sizeof(Py_ssize_t) &&\ + (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\ + v == (type)PY_SSIZE_T_MAX))) ) +static CYTHON_INLINE int __Pyx_is_valid_index(Py_ssize_t i, Py_ssize_t limit) { + return (size_t) i < (size_t) limit; +} +#if defined (__cplusplus) && __cplusplus >= 201103L + #include + #define __Pyx_sst_abs(value) std::abs(value) +#elif SIZEOF_INT >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) abs(value) +#elif SIZEOF_LONG >= SIZEOF_SIZE_T + #define __Pyx_sst_abs(value) labs(value) +#elif defined (_MSC_VER) + #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value)) +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define __Pyx_sst_abs(value) llabs(value) +#elif defined (__GNUC__) + #define __Pyx_sst_abs(value) __builtin_llabs(value) +#else + #define __Pyx_sst_abs(value) ((value<0) ? -value : value) +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*); +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length); +#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s)) +#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l) +#define __Pyx_PyBytes_FromString PyBytes_FromString +#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*); +#if PY_MAJOR_VERSION < 3 + #define __Pyx_PyStr_FromString __Pyx_PyBytes_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize +#else + #define __Pyx_PyStr_FromString __Pyx_PyUnicode_FromString + #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize +#endif +#define __Pyx_PyBytes_AsWritableString(s) ((char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableSString(s) ((signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsWritableUString(s) ((unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsString(s) ((const char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsSString(s) ((const signed char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyBytes_AsUString(s) ((const unsigned char*) PyBytes_AS_STRING(s)) +#define __Pyx_PyObject_AsWritableString(s) ((char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableSString(s) ((signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsWritableUString(s) ((unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsSString(s) ((const signed char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_AsUString(s) ((const unsigned char*) __Pyx_PyObject_AsString(s)) +#define __Pyx_PyObject_FromCString(s) __Pyx_PyObject_FromString((const char*)s) +#define __Pyx_PyBytes_FromCString(s) __Pyx_PyBytes_FromString((const char*)s) +#define __Pyx_PyByteArray_FromCString(s) __Pyx_PyByteArray_FromString((const char*)s) +#define __Pyx_PyStr_FromCString(s) __Pyx_PyStr_FromString((const char*)s) +#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s) +static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { + const Py_UNICODE *u_end = u; + while (*u_end++) ; + return (size_t)(u_end - u - 1); +} +#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u)) +#define __Pyx_PyUnicode_FromUnicodeAndLength 
PyUnicode_FromUnicode +#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode +#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj) +#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None) +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b); +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject*); +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x); +#define __Pyx_PySequence_Tuple(obj)\ + (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj)) +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*); +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t); +static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject*); +#if CYTHON_ASSUME_SAFE_MACROS +#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x)) +#else +#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x) +#endif +#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x)) +#if PY_MAJOR_VERSION >= 3 +#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x)) +#else +#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x)) +#endif +#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x)) +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII +static int __Pyx_sys_getdefaultencoding_not_ascii; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + PyObject* ascii_chars_u = NULL; + PyObject* ascii_chars_b = NULL; + const char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL); + Py_DECREF(sys); + if (!default_encoding) goto bad; + default_encoding_c = PyBytes_AsString(default_encoding); + if (!default_encoding_c) goto bad; + if (strcmp(default_encoding_c, "ascii") == 0) { + __Pyx_sys_getdefaultencoding_not_ascii = 0; + } else { + char ascii_chars[128]; + int c; + for (c = 0; c < 128; c++) { + ascii_chars[c] = c; + } + __Pyx_sys_getdefaultencoding_not_ascii = 1; + ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL); + if (!ascii_chars_u) goto bad; + ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); + if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { + PyErr_Format( + PyExc_ValueError, + "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.", + default_encoding_c); + goto bad; + } + Py_DECREF(ascii_chars_u); + Py_DECREF(ascii_chars_b); + } + Py_DECREF(default_encoding); + return 0; +bad: + Py_XDECREF(default_encoding); + Py_XDECREF(ascii_chars_u); + Py_XDECREF(ascii_chars_b); + return -1; +} +#endif +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3 +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL) +#else +#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL) +#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +static char* __PYX_DEFAULT_STRING_ENCODING; +static int __Pyx_init_sys_getdefaultencoding_params(void) { + PyObject* sys; + PyObject* default_encoding = NULL; + char* default_encoding_c; + sys = PyImport_ImportModule("sys"); + if (!sys) goto bad; + 
default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys);
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding);
+    if (!default_encoding_c) goto bad;
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1);
+    if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
+    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+    Py_DECREF(default_encoding);
+    return 0;
+bad:
+    Py_XDECREF(default_encoding);
+    return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+  #define likely(x)   __builtin_expect(!!(x), 1)
+  #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC < 2.95 */
+  #define likely(x)   (x)
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; }
+
+static PyObject *__pyx_m = NULL;
+static PyObject *__pyx_d;
+static PyObject *__pyx_b;
+static PyObject *__pyx_cython_runtime = NULL;
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static PyObject *__pyx_empty_unicode;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__;
+static const char *__pyx_filename;
+
+/* Header.proto */
+#if !defined(CYTHON_CCOMPLEX)
+  #if defined(__cplusplus)
+    #define CYTHON_CCOMPLEX 1
+  #elif defined(_Complex_I)
+    #define CYTHON_CCOMPLEX 1
+  #else
+    #define CYTHON_CCOMPLEX 0
+  #endif
+#endif
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #include <complex>
+  #else
+    #include <complex.h>
+  #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+  #undef _Complex_I
+  #define _Complex_I 1.0fj
+#endif
+
+
+static const char *__pyx_f[] = {
+  "box_overlaps.pyx",
+  "__init__.pxd",
+  "type.pxd",
+};
+/* BufferFormatStructs.proto */
+#define IS_UNSIGNED(type) (((type) -1) > 0)
+struct __Pyx_StructField_;
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)
+typedef struct {
+  const char* name;
+  struct __Pyx_StructField_* fields;
+  size_t size;
+  size_t arraysize[8];
+  int ndim;
+  char typegroup;
+  char is_unsigned;
+  int flags;
+} __Pyx_TypeInfo;
+typedef struct __Pyx_StructField_ {
+  __Pyx_TypeInfo* type;
+  const char* name;
+  size_t offset;
+} __Pyx_StructField;
+typedef struct {
+  __Pyx_StructField* field;
+  size_t parent_offset;
+} __Pyx_BufFmt_StackElem;
+typedef struct {
+  __Pyx_StructField root;
+  __Pyx_BufFmt_StackElem* head;
+  size_t fmt_offset;
+  size_t new_count, enc_count;
+  size_t struct_alignment;
+  int is_complex;
+  char enc_type;
+  char new_packmode;
+  char enc_packmode;
+  char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+
+/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":689
+ * # in Cython to enable them only on the right systems.
+ * + * ctypedef npy_int8 int8_t # <<<<<<<<<<<<<< + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + */ +typedef npy_int8 __pyx_t_5numpy_int8_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":690 + * + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t # <<<<<<<<<<<<<< + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t + */ +typedef npy_int16 __pyx_t_5numpy_int16_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":691 + * ctypedef npy_int8 int8_t + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t # <<<<<<<<<<<<<< + * ctypedef npy_int64 int64_t + * #ctypedef npy_int96 int96_t + */ +typedef npy_int32 __pyx_t_5numpy_int32_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":692 + * ctypedef npy_int16 int16_t + * ctypedef npy_int32 int32_t + * ctypedef npy_int64 int64_t # <<<<<<<<<<<<<< + * #ctypedef npy_int96 int96_t + * #ctypedef npy_int128 int128_t + */ +typedef npy_int64 __pyx_t_5numpy_int64_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":696 + * #ctypedef npy_int128 int128_t + * + * ctypedef npy_uint8 uint8_t # <<<<<<<<<<<<<< + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + */ +typedef npy_uint8 __pyx_t_5numpy_uint8_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":697 + * + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t # <<<<<<<<<<<<<< + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t + */ +typedef npy_uint16 __pyx_t_5numpy_uint16_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":698 + * ctypedef npy_uint8 uint8_t + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t # <<<<<<<<<<<<<< + * ctypedef npy_uint64 uint64_t + * #ctypedef npy_uint96 uint96_t + */ +typedef npy_uint32 __pyx_t_5numpy_uint32_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":699 + * ctypedef npy_uint16 uint16_t + * ctypedef npy_uint32 uint32_t + * ctypedef npy_uint64 uint64_t # <<<<<<<<<<<<<< + * #ctypedef npy_uint96 uint96_t + * #ctypedef npy_uint128 uint128_t + */ +typedef npy_uint64 __pyx_t_5numpy_uint64_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":703 + * #ctypedef npy_uint128 uint128_t + * + * ctypedef npy_float32 float32_t # <<<<<<<<<<<<<< + * ctypedef npy_float64 float64_t + * #ctypedef npy_float80 float80_t + */ +typedef npy_float32 __pyx_t_5numpy_float32_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":704 + * + * ctypedef npy_float32 float32_t + * ctypedef npy_float64 float64_t # <<<<<<<<<<<<<< + * #ctypedef npy_float80 float80_t + * #ctypedef npy_float128 float128_t + */ +typedef npy_float64 __pyx_t_5numpy_float64_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":713 + * # The int types are mapped a bit surprising -- + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t + */ +typedef npy_long __pyx_t_5numpy_int_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":714 + * # numpy.int corresponds to 'l' and numpy.long to 'q' + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t # <<<<<<<<<<<<<< + * ctypedef npy_longlong longlong_t + * + */ +typedef npy_longlong 
__pyx_t_5numpy_long_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":715 + * ctypedef npy_long int_t + * ctypedef npy_longlong long_t + * ctypedef npy_longlong longlong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_ulong uint_t + */ +typedef npy_longlong __pyx_t_5numpy_longlong_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":717 + * ctypedef npy_longlong longlong_t + * + * ctypedef npy_ulong uint_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t + */ +typedef npy_ulong __pyx_t_5numpy_uint_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":718 + * + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t # <<<<<<<<<<<<<< + * ctypedef npy_ulonglong ulonglong_t + * + */ +typedef npy_ulonglong __pyx_t_5numpy_ulong_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":719 + * ctypedef npy_ulong uint_t + * ctypedef npy_ulonglong ulong_t + * ctypedef npy_ulonglong ulonglong_t # <<<<<<<<<<<<<< + * + * ctypedef npy_intp intp_t + */ +typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":721 + * ctypedef npy_ulonglong ulonglong_t + * + * ctypedef npy_intp intp_t # <<<<<<<<<<<<<< + * ctypedef npy_uintp uintp_t + * + */ +typedef npy_intp __pyx_t_5numpy_intp_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":722 + * + * ctypedef npy_intp intp_t + * ctypedef npy_uintp uintp_t # <<<<<<<<<<<<<< + * + * ctypedef npy_double float_t + */ +typedef npy_uintp __pyx_t_5numpy_uintp_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":724 + * ctypedef npy_uintp uintp_t + * + * ctypedef npy_double float_t # <<<<<<<<<<<<<< + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t + */ +typedef npy_double __pyx_t_5numpy_float_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":725 + * + * ctypedef npy_double float_t + * ctypedef npy_double double_t # <<<<<<<<<<<<<< + * ctypedef npy_longdouble longdouble_t + * + */ +typedef npy_double __pyx_t_5numpy_double_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":726 + * ctypedef npy_double float_t + * ctypedef npy_double double_t + * ctypedef npy_longdouble longdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cfloat cfloat_t + */ +typedef npy_longdouble __pyx_t_5numpy_longdouble_t; + +/* "box_overlaps.pyx":13 + * + * DTYPE = np.float + * ctypedef np.float_t DTYPE_t # <<<<<<<<<<<<<< + * + * def bbox_overlaps( + */ +typedef __pyx_t_5numpy_float_t __pyx_t_4bbox_DTYPE_t; +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< float > __pyx_t_float_complex; + #else + typedef float _Complex __pyx_t_float_complex; + #endif +#else + typedef struct { float real, imag; } __pyx_t_float_complex; +#endif +static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float); + +/* Declarations.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + typedef ::std::complex< double > __pyx_t_double_complex; + #else + typedef double _Complex __pyx_t_double_complex; + #endif +#else + typedef struct { double real, imag; } __pyx_t_double_complex; +#endif +static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double); + + +/*--- Type declarations ---*/ + +/* 
"../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":728 + * ctypedef npy_longdouble longdouble_t + * + * ctypedef npy_cfloat cfloat_t # <<<<<<<<<<<<<< + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t + */ +typedef npy_cfloat __pyx_t_5numpy_cfloat_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":729 + * + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t # <<<<<<<<<<<<<< + * ctypedef npy_clongdouble clongdouble_t + * + */ +typedef npy_cdouble __pyx_t_5numpy_cdouble_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":730 + * ctypedef npy_cfloat cfloat_t + * ctypedef npy_cdouble cdouble_t + * ctypedef npy_clongdouble clongdouble_t # <<<<<<<<<<<<<< + * + * ctypedef npy_cdouble complex_t + */ +typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t; + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":732 + * ctypedef npy_clongdouble clongdouble_t + * + * ctypedef npy_cdouble complex_t # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew1(a): + */ +typedef npy_cdouble __pyx_t_5numpy_complex_t; + +/* --- Runtime support code (head) --- */ +/* Refnanny.proto */ +#ifndef CYTHON_REFNANNY + #define CYTHON_REFNANNY 0 +#endif +#if CYTHON_REFNANNY + typedef struct { + void (*INCREF)(void*, PyObject*, int); + void (*DECREF)(void*, PyObject*, int); + void (*GOTREF)(void*, PyObject*, int); + void (*GIVEREF)(void*, PyObject*, int); + void* (*SetupContext)(const char*, int, const char*); + void (*FinishContext)(void**); + } __Pyx_RefNannyAPIStruct; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL; + static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); + #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL; +#ifdef WITH_THREAD + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + if (acquire_gil) {\ + PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + PyGILState_Release(__pyx_gilstate_save);\ + } else {\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\ + } +#else + #define __Pyx_RefNannySetupContext(name, acquire_gil)\ + __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__) +#endif + #define __Pyx_RefNannyFinishContext()\ + __Pyx_RefNanny->FinishContext(&__pyx_refnanny) + #define __Pyx_INCREF(r) __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_DECREF(r) __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GOTREF(r) __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__) + #define __Pyx_XINCREF(r) do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0) + #define __Pyx_XDECREF(r) do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0) + #define __Pyx_XGOTREF(r) do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0) + #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0) +#else + #define __Pyx_RefNannyDeclarations + #define __Pyx_RefNannySetupContext(name, acquire_gil) + #define __Pyx_RefNannyFinishContext() + #define __Pyx_INCREF(r) Py_INCREF(r) + #define __Pyx_DECREF(r) Py_DECREF(r) + #define __Pyx_GOTREF(r) + #define __Pyx_GIVEREF(r) + #define __Pyx_XINCREF(r) Py_XINCREF(r) + #define __Pyx_XDECREF(r) Py_XDECREF(r) + #define __Pyx_XGOTREF(r) + #define 
__Pyx_XGIVEREF(r) +#endif +#define __Pyx_XDECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_XDECREF(tmp);\ + } while (0) +#define __Pyx_DECREF_SET(r, v) do {\ + PyObject *tmp = (PyObject *) r;\ + r = v; __Pyx_DECREF(tmp);\ + } while (0) +#define __Pyx_CLEAR(r) do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) +#define __Pyx_XCLEAR(r) do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) + +/* PyObjectGetAttrStr.proto */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name); +#else +#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n) +#endif + +/* GetBuiltinName.proto */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name); + +/* RaiseArgTupleInvalid.proto */ +static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact, + Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); + +/* RaiseDoubleKeywords.proto */ +static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); + +/* ParseKeywords.proto */ +static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\ + PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\ + const char* function_name); + +/* ArgTypeTest.proto */ +#define __Pyx_ArgTypeTest(obj, type, none_allowed, name, exact)\ + ((likely((Py_TYPE(obj) == type) | (none_allowed && (obj == Py_None)))) ? 1 :\ + __Pyx__ArgTypeTest(obj, type, name, exact)) +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact); + +/* IsLittleEndian.proto */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void); + +/* BufferFormatCheck.proto */ +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts); +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type); + +/* BufferGetAndValidate.proto */ +#define __Pyx_GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)\ + ((obj == Py_None || obj == NULL) ?\ + (__Pyx_ZeroBuffer(buf), 0) :\ + __Pyx__GetBufferAndValidate(buf, obj, dtype, flags, nd, cast, stack)) +static int __Pyx__GetBufferAndValidate(Py_buffer* buf, PyObject* obj, + __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); +static void __Pyx_ZeroBuffer(Py_buffer* buf); +static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info); +static Py_ssize_t __Pyx_minusones[] = { -1, -1, -1, -1, -1, -1, -1, -1 }; +static Py_ssize_t __Pyx_zeros[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +/* PyDictVersioning.proto */ +#if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +#define __PYX_DICT_VERSION_INIT ((PY_UINT64_T) -1) +#define __PYX_GET_DICT_VERSION(dict) (((PyDictObject*)(dict))->ma_version_tag) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var)\ + (version_var) = __PYX_GET_DICT_VERSION(dict);\ + (cache_var) = (value); +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + if (likely(__PYX_GET_DICT_VERSION(DICT) == __pyx_dict_version)) {\ + (VAR) = __pyx_dict_cached_value;\ + } else {\ + (VAR) = __pyx_dict_cached_value = (LOOKUP);\ + __pyx_dict_version = __PYX_GET_DICT_VERSION(DICT);\ + }\ +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj); +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj); +static CYTHON_INLINE int 
__Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version); +#else +#define __PYX_GET_DICT_VERSION(dict) (0) +#define __PYX_UPDATE_DICT_CACHE(dict, value, cache_var, version_var) +#define __PYX_PY_DICT_LOOKUP_IF_MODIFIED(VAR, DICT, LOOKUP) (VAR) = (LOOKUP); +#endif + +/* GetModuleGlobalName.proto */ +#if CYTHON_USE_DICT_VERSIONS +#define __Pyx_GetModuleGlobalName(var, name) do {\ + static PY_UINT64_T __pyx_dict_version = 0;\ + static PyObject *__pyx_dict_cached_value = NULL;\ + (var) = (likely(__pyx_dict_version == __PYX_GET_DICT_VERSION(__pyx_d))) ?\ + (likely(__pyx_dict_cached_value) ? __Pyx_NewRef(__pyx_dict_cached_value) : __Pyx_GetBuiltinName(name)) :\ + __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\ +} while(0) +#define __Pyx_GetModuleGlobalNameUncached(var, name) do {\ + PY_UINT64_T __pyx_dict_version;\ + PyObject *__pyx_dict_cached_value;\ + (var) = __Pyx__GetModuleGlobalName(name, &__pyx_dict_version, &__pyx_dict_cached_value);\ +} while(0) +static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value); +#else +#define __Pyx_GetModuleGlobalName(var, name) (var) = __Pyx__GetModuleGlobalName(name) +#define __Pyx_GetModuleGlobalNameUncached(var, name) (var) = __Pyx__GetModuleGlobalName(name) +static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name); +#endif + +/* PyObjectCall.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); +#else +#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw) +#endif + +/* ExtTypeTest.proto */ +static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); + +/* BufferIndexError.proto */ +static void __Pyx_RaiseBufferIndexError(int axis); + +#define __Pyx_BufPtrStrided2d(type, buf, i0, s0, i1, s1) (type)((char*)buf + i0 * s0 + i1 * s1) +/* PyThreadStateGet.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyThreadState_declare PyThreadState *__pyx_tstate; +#define __Pyx_PyThreadState_assign __pyx_tstate = __Pyx_PyThreadState_Current; +#define __Pyx_PyErr_Occurred() __pyx_tstate->curexc_type +#else +#define __Pyx_PyThreadState_declare +#define __Pyx_PyThreadState_assign +#define __Pyx_PyErr_Occurred() PyErr_Occurred() +#endif + +/* PyErrFetchRestore.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL) +#define __Pyx_ErrRestoreWithState(type, value, tb) __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrFetchWithState(type, value, tb) __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL)) +#else +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#endif +#else +#define __Pyx_PyErr_Clear() PyErr_Clear() +#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc) +#define __Pyx_ErrRestoreWithState(type, value, tb) PyErr_Restore(type, value, tb) +#define 
__Pyx_ErrFetchWithState(type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestoreInState(tstate, type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetchInState(tstate, type, value, tb) PyErr_Fetch(type, value, tb) +#define __Pyx_ErrRestore(type, value, tb) PyErr_Restore(type, value, tb) +#define __Pyx_ErrFetch(type, value, tb) PyErr_Fetch(type, value, tb) +#endif + +/* GetTopmostException.proto */ +#if CYTHON_USE_EXC_INFO_STACK +static _PyErr_StackItem * __Pyx_PyErr_GetTopmostException(PyThreadState *tstate); +#endif + +/* SaveResetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_ExceptionSave(type, value, tb) __Pyx__ExceptionSave(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#define __Pyx_ExceptionReset(type, value, tb) __Pyx__ExceptionReset(__pyx_tstate, type, value, tb) +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb); +#else +#define __Pyx_ExceptionSave(type, value, tb) PyErr_GetExcInfo(type, value, tb) +#define __Pyx_ExceptionReset(type, value, tb) PyErr_SetExcInfo(type, value, tb) +#endif + +/* PyErrExceptionMatches.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_PyErr_ExceptionMatches(err) __Pyx_PyErr_ExceptionMatchesInState(__pyx_tstate, err) +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err); +#else +#define __Pyx_PyErr_ExceptionMatches(err) PyErr_ExceptionMatches(err) +#endif + +/* GetException.proto */ +#if CYTHON_FAST_THREAD_STATE +#define __Pyx_GetException(type, value, tb) __Pyx__GetException(__pyx_tstate, type, value, tb) +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb); +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb); +#endif + +/* RaiseException.proto */ +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); + +/* TypeImport.proto */ +#ifndef __PYX_HAVE_RT_ImportType_proto +#define __PYX_HAVE_RT_ImportType_proto +enum __Pyx_ImportType_CheckSize { + __Pyx_ImportType_CheckSize_Error = 0, + __Pyx_ImportType_CheckSize_Warn = 1, + __Pyx_ImportType_CheckSize_Ignore = 2 +}; +static PyTypeObject *__Pyx_ImportType(PyObject* module, const char *module_name, const char *class_name, size_t size, enum __Pyx_ImportType_CheckSize check_size); +#endif + +/* Import.proto */ +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); + +/* CLineInTraceback.proto */ +#ifdef CYTHON_CLINE_IN_TRACEBACK +#define __Pyx_CLineForTraceback(tstate, c_line) (((CYTHON_CLINE_IN_TRACEBACK)) ? 
c_line : 0) +#else +static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line); +#endif + +/* CodeObjectCache.proto */ +typedef struct { + PyCodeObject* code_object; + int code_line; +} __Pyx_CodeObjectCacheEntry; +struct __Pyx_CodeObjectCache { + int count; + int max_count; + __Pyx_CodeObjectCacheEntry* entries; +}; +static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; +static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line); +static PyCodeObject *__pyx_find_code_object(int code_line); +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object); + +/* AddTraceback.proto */ +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename); + +/* BufferStructDeclare.proto */ +typedef struct { + Py_ssize_t shape, strides, suboffsets; +} __Pyx_Buf_DimInfo; +typedef struct { + size_t refcount; + Py_buffer pybuffer; +} __Pyx_Buffer; +typedef struct { + __Pyx_Buffer *rcbuffer; + char *data; + __Pyx_Buf_DimInfo diminfo[8]; +} __Pyx_LocalBuf_ND; + +#if PY_MAJOR_VERSION < 3 + static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags); + static void __Pyx_ReleaseBuffer(Py_buffer *view); +#else + #define __Pyx_GetBuffer PyObject_GetBuffer + #define __Pyx_ReleaseBuffer PyBuffer_Release +#endif + + +/* GCCDiagnostics.proto */ +#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) +#define __Pyx_HAS_GCC_DIAGNOSTIC +#endif + +/* RealImag.proto */ +#if CYTHON_CCOMPLEX + #ifdef __cplusplus + #define __Pyx_CREAL(z) ((z).real()) + #define __Pyx_CIMAG(z) ((z).imag()) + #else + #define __Pyx_CREAL(z) (__real__(z)) + #define __Pyx_CIMAG(z) (__imag__(z)) + #endif +#else + #define __Pyx_CREAL(z) ((z).real) + #define __Pyx_CIMAG(z) ((z).imag) +#endif +#if defined(__cplusplus) && CYTHON_CCOMPLEX\ + && (defined(_WIN32) || defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4 )) || __cplusplus >= 201103) + #define __Pyx_SET_CREAL(z,x) ((z).real(x)) + #define __Pyx_SET_CIMAG(z,y) ((z).imag(y)) +#else + #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x) + #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y) +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_float(a, b) ((a)==(b)) + #define __Pyx_c_sum_float(a, b) ((a)+(b)) + #define __Pyx_c_diff_float(a, b) ((a)-(b)) + #define __Pyx_c_prod_float(a, b) ((a)*(b)) + #define __Pyx_c_quot_float(a, b) ((a)/(b)) + #define __Pyx_c_neg_float(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_float(z) ((z)==(float)0) + #define __Pyx_c_conj_float(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_float(z) (::std::abs(z)) + #define __Pyx_c_pow_float(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_float(z) ((z)==0) + #define __Pyx_c_conj_float(z) (conjf(z)) + #if 1 + #define __Pyx_c_abs_float(z) (cabsf(z)) + #define __Pyx_c_pow_float(a, b) (cpowf(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex, __pyx_t_float_complex); + static CYTHON_INLINE 
__pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex); + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex); + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex, __pyx_t_float_complex); + #endif +#endif + +/* Arithmetic.proto */ +#if CYTHON_CCOMPLEX + #define __Pyx_c_eq_double(a, b) ((a)==(b)) + #define __Pyx_c_sum_double(a, b) ((a)+(b)) + #define __Pyx_c_diff_double(a, b) ((a)-(b)) + #define __Pyx_c_prod_double(a, b) ((a)*(b)) + #define __Pyx_c_quot_double(a, b) ((a)/(b)) + #define __Pyx_c_neg_double(a) (-(a)) + #ifdef __cplusplus + #define __Pyx_c_is_zero_double(z) ((z)==(double)0) + #define __Pyx_c_conj_double(z) (::std::conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (::std::abs(z)) + #define __Pyx_c_pow_double(a, b) (::std::pow(a, b)) + #endif + #else + #define __Pyx_c_is_zero_double(z) ((z)==0) + #define __Pyx_c_conj_double(z) (conj(z)) + #if 1 + #define __Pyx_c_abs_double(z) (cabs(z)) + #define __Pyx_c_pow_double(a, b) (cpow(a, b)) + #endif + #endif +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex, __pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex); + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex); + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex); + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex, __pyx_t_double_complex); + #endif +#endif + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *); + +/* CIntToPy.proto */ +static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value); + +/* CIntFromPy.proto */ +static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *); + +/* CIntFromPy.proto */ +static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); + +/* FastTypeChecks.proto */ +#if CYTHON_COMPILING_IN_CPYTHON +#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type) +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type); +static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2); +#else +#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type) +#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type) +#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2)) +#endif +#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception) + +/* 
CheckBinaryVersion.proto */ +static int __Pyx_check_binary_version(void); + +/* InitStrings.proto */ +static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); + + +/* Module declarations from 'cython' */ + +/* Module declarations from 'cpython.buffer' */ + +/* Module declarations from 'libc.string' */ + +/* Module declarations from 'libc.stdio' */ + +/* Module declarations from '__builtin__' */ + +/* Module declarations from 'cpython.type' */ +static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0; + +/* Module declarations from 'cpython' */ + +/* Module declarations from 'cpython.object' */ + +/* Module declarations from 'cpython.ref' */ + +/* Module declarations from 'cpython.mem' */ + +/* Module declarations from 'numpy' */ + +/* Module declarations from 'numpy' */ +static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; +static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0; +static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0; +static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0; +static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0; + +/* Module declarations from 'bbox' */ +static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_4bbox_DTYPE_t = { "DTYPE_t", NULL, sizeof(__pyx_t_4bbox_DTYPE_t), { 0 }, 0, 'R', 0, 0 }; +#define __Pyx_MODULE_NAME "bbox" +extern int __pyx_module_is_main_bbox; +int __pyx_module_is_main_bbox = 0; + +/* Implementation of 'bbox' */ +static PyObject *__pyx_builtin_range; +static PyObject *__pyx_builtin_ImportError; +static const char __pyx_k_K[] = "K"; +static const char __pyx_k_N[] = "N"; +static const char __pyx_k_k[] = "k"; +static const char __pyx_k_n[] = "n"; +static const char __pyx_k_ih[] = "ih"; +static const char __pyx_k_iw[] = "iw"; +static const char __pyx_k_np[] = "np"; +static const char __pyx_k_ua[] = "ua"; +static const char __pyx_k_bbox[] = "bbox"; +static const char __pyx_k_main[] = "__main__"; +static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_test[] = "__test__"; +static const char __pyx_k_DTYPE[] = "DTYPE"; +static const char __pyx_k_boxes[] = "boxes"; +static const char __pyx_k_dtype[] = "dtype"; +static const char __pyx_k_float[] = "float"; +static const char __pyx_k_numpy[] = "numpy"; +static const char __pyx_k_range[] = "range"; +static const char __pyx_k_zeros[] = "zeros"; +static const char __pyx_k_import[] = "__import__"; +static const char __pyx_k_box_area[] = "box_area"; +static const char __pyx_k_overlaps[] = "overlaps"; +static const char __pyx_k_ImportError[] = "ImportError"; +static const char __pyx_k_query_boxes[] = "query_boxes"; +static const char __pyx_k_bbox_overlaps[] = "bbox_overlaps"; +static const char __pyx_k_box_overlaps_pyx[] = "box_overlaps.pyx"; +static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback"; +static const char __pyx_k_numpy_core_multiarray_failed_to[] = "numpy.core.multiarray failed to import"; +static const char __pyx_k_numpy_core_umath_failed_to_impor[] = "numpy.core.umath failed to import"; +static PyObject *__pyx_n_s_DTYPE; +static PyObject *__pyx_n_s_ImportError; +static PyObject *__pyx_n_s_K; +static PyObject *__pyx_n_s_N; +static PyObject *__pyx_n_s_bbox; +static PyObject *__pyx_n_s_bbox_overlaps; +static PyObject *__pyx_n_s_box_area; +static PyObject *__pyx_kp_s_box_overlaps_pyx; +static PyObject *__pyx_n_s_boxes; +static PyObject *__pyx_n_s_cline_in_traceback; +static PyObject *__pyx_n_s_dtype; +static PyObject *__pyx_n_s_float; +static PyObject *__pyx_n_s_ih; +static PyObject *__pyx_n_s_import; +static PyObject *__pyx_n_s_iw; +static PyObject *__pyx_n_s_k; +static 
PyObject *__pyx_n_s_main; +static PyObject *__pyx_n_s_n; +static PyObject *__pyx_n_s_name; +static PyObject *__pyx_n_s_np; +static PyObject *__pyx_n_s_numpy; +static PyObject *__pyx_kp_s_numpy_core_multiarray_failed_to; +static PyObject *__pyx_kp_s_numpy_core_umath_failed_to_impor; +static PyObject *__pyx_n_s_overlaps; +static PyObject *__pyx_n_s_query_boxes; +static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_ua; +static PyObject *__pyx_n_s_zeros; +static PyObject *__pyx_pf_4bbox_bbox_overlaps(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_boxes, PyArrayObject *__pyx_v_query_boxes); /* proto */ +static PyObject *__pyx_tuple_; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__3; +static PyObject *__pyx_codeobj__4; +/* Late includes */ + +/* "box_overlaps.pyx":15 + * ctypedef np.float_t DTYPE_t + * + * def bbox_overlaps( # <<<<<<<<<<<<<< + * np.ndarray[DTYPE_t, ndim=2] boxes, + * np.ndarray[DTYPE_t, ndim=2] query_boxes): + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_4bbox_1bbox_overlaps(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_4bbox_bbox_overlaps[] = "\n Parameters\n ----------\n boxes: (N, 4) ndarray of float\n query_boxes: (K, 4) ndarray of float\n Returns\n -------\n overlaps: (N, K) ndarray of overlap between boxes and query_boxes\n "; +static PyMethodDef __pyx_mdef_4bbox_1bbox_overlaps = {"bbox_overlaps", (PyCFunction)(void*)(PyCFunctionWithKeywords)__pyx_pw_4bbox_1bbox_overlaps, METH_VARARGS|METH_KEYWORDS, __pyx_doc_4bbox_bbox_overlaps}; +static PyObject *__pyx_pw_4bbox_1bbox_overlaps(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { + PyArrayObject *__pyx_v_boxes = 0; + PyArrayObject *__pyx_v_query_boxes = 0; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("bbox_overlaps (wrapper)", 0); + { + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_boxes,&__pyx_n_s_query_boxes,0}; + PyObject* values[2] = {0,0}; + if (unlikely(__pyx_kwds)) { + Py_ssize_t kw_args; + const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args); + switch (pos_args) { + case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + CYTHON_FALLTHROUGH; + case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + CYTHON_FALLTHROUGH; + case 0: break; + default: goto __pyx_L5_argtuple_error; + } + kw_args = PyDict_Size(__pyx_kwds); + switch (pos_args) { + case 0: + if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_boxes)) != 0)) kw_args--; + else goto __pyx_L5_argtuple_error; + CYTHON_FALLTHROUGH; + case 1: + if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_query_boxes)) != 0)) kw_args--; + else { + __Pyx_RaiseArgtupleInvalid("bbox_overlaps", 1, 2, 2, 1); __PYX_ERR(0, 15, __pyx_L3_error) + } + } + if (unlikely(kw_args > 0)) { + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "bbox_overlaps") < 0)) __PYX_ERR(0, 15, __pyx_L3_error) + } + } else if (PyTuple_GET_SIZE(__pyx_args) != 2) { + goto __pyx_L5_argtuple_error; + } else { + values[0] = PyTuple_GET_ITEM(__pyx_args, 0); + values[1] = PyTuple_GET_ITEM(__pyx_args, 1); + } + __pyx_v_boxes = ((PyArrayObject *)values[0]); + __pyx_v_query_boxes = ((PyArrayObject *)values[1]); + } + goto __pyx_L4_argument_unpacking_done; + __pyx_L5_argtuple_error:; + __Pyx_RaiseArgtupleInvalid("bbox_overlaps", 1, 2, 2, 
PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 15, __pyx_L3_error) + __pyx_L3_error:; + __Pyx_AddTraceback("bbox.bbox_overlaps", __pyx_clineno, __pyx_lineno, __pyx_filename); + __Pyx_RefNannyFinishContext(); + return NULL; + __pyx_L4_argument_unpacking_done:; + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_boxes), __pyx_ptype_5numpy_ndarray, 1, "boxes", 0))) __PYX_ERR(0, 16, __pyx_L1_error) + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_query_boxes), __pyx_ptype_5numpy_ndarray, 1, "query_boxes", 0))) __PYX_ERR(0, 17, __pyx_L1_error) + __pyx_r = __pyx_pf_4bbox_bbox_overlaps(__pyx_self, __pyx_v_boxes, __pyx_v_query_boxes); + + /* function exit code */ + goto __pyx_L0; + __pyx_L1_error:; + __pyx_r = NULL; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_4bbox_bbox_overlaps(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_boxes, PyArrayObject *__pyx_v_query_boxes) { + unsigned int __pyx_v_N; + unsigned int __pyx_v_K; + PyArrayObject *__pyx_v_overlaps = 0; + __pyx_t_4bbox_DTYPE_t __pyx_v_iw; + __pyx_t_4bbox_DTYPE_t __pyx_v_ih; + __pyx_t_4bbox_DTYPE_t __pyx_v_box_area; + __pyx_t_4bbox_DTYPE_t __pyx_v_ua; + unsigned int __pyx_v_k; + unsigned int __pyx_v_n; + __Pyx_LocalBuf_ND __pyx_pybuffernd_boxes; + __Pyx_Buffer __pyx_pybuffer_boxes; + __Pyx_LocalBuf_ND __pyx_pybuffernd_overlaps; + __Pyx_Buffer __pyx_pybuffer_overlaps; + __Pyx_LocalBuf_ND __pyx_pybuffernd_query_boxes; + __Pyx_Buffer __pyx_pybuffer_query_boxes; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + PyObject *__pyx_t_4 = NULL; + PyArrayObject *__pyx_t_5 = NULL; + unsigned int __pyx_t_6; + unsigned int __pyx_t_7; + unsigned int __pyx_t_8; + size_t __pyx_t_9; + Py_ssize_t __pyx_t_10; + int __pyx_t_11; + size_t __pyx_t_12; + Py_ssize_t __pyx_t_13; + size_t __pyx_t_14; + Py_ssize_t __pyx_t_15; + size_t __pyx_t_16; + Py_ssize_t __pyx_t_17; + unsigned int __pyx_t_18; + unsigned int __pyx_t_19; + unsigned int __pyx_t_20; + __pyx_t_4bbox_DTYPE_t __pyx_t_21; + __pyx_t_4bbox_DTYPE_t __pyx_t_22; + __pyx_t_4bbox_DTYPE_t __pyx_t_23; + __pyx_t_4bbox_DTYPE_t __pyx_t_24; + int __pyx_t_25; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("bbox_overlaps", 0); + __pyx_pybuffer_overlaps.pybuffer.buf = NULL; + __pyx_pybuffer_overlaps.refcount = 0; + __pyx_pybuffernd_overlaps.data = NULL; + __pyx_pybuffernd_overlaps.rcbuffer = &__pyx_pybuffer_overlaps; + __pyx_pybuffer_boxes.pybuffer.buf = NULL; + __pyx_pybuffer_boxes.refcount = 0; + __pyx_pybuffernd_boxes.data = NULL; + __pyx_pybuffernd_boxes.rcbuffer = &__pyx_pybuffer_boxes; + __pyx_pybuffer_query_boxes.pybuffer.buf = NULL; + __pyx_pybuffer_query_boxes.refcount = 0; + __pyx_pybuffernd_query_boxes.data = NULL; + __pyx_pybuffernd_query_boxes.rcbuffer = &__pyx_pybuffer_query_boxes; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_boxes.rcbuffer->pybuffer, (PyObject*)__pyx_v_boxes, &__Pyx_TypeInfo_nn___pyx_t_4bbox_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 15, __pyx_L1_error) + } + __pyx_pybuffernd_boxes.diminfo[0].strides = __pyx_pybuffernd_boxes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_boxes.diminfo[0].shape = __pyx_pybuffernd_boxes.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_boxes.diminfo[1].strides = __pyx_pybuffernd_boxes.rcbuffer->pybuffer.strides[1]; 
__pyx_pybuffernd_boxes.diminfo[1].shape = __pyx_pybuffernd_boxes.rcbuffer->pybuffer.shape[1]; + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_query_boxes.rcbuffer->pybuffer, (PyObject*)__pyx_v_query_boxes, &__Pyx_TypeInfo_nn___pyx_t_4bbox_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) __PYX_ERR(0, 15, __pyx_L1_error) + } + __pyx_pybuffernd_query_boxes.diminfo[0].strides = __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_query_boxes.diminfo[0].shape = __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_query_boxes.diminfo[1].strides = __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_query_boxes.diminfo[1].shape = __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.shape[1]; + + /* "box_overlaps.pyx":27 + * overlaps: (N, K) ndarray of overlap between boxes and query_boxes + * """ + * cdef unsigned int N = boxes.shape[0] # <<<<<<<<<<<<<< + * cdef unsigned int K = query_boxes.shape[0] + * cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + */ + __pyx_v_N = (__pyx_v_boxes->dimensions[0]); + + /* "box_overlaps.pyx":28 + * """ + * cdef unsigned int N = boxes.shape[0] + * cdef unsigned int K = query_boxes.shape[0] # <<<<<<<<<<<<<< + * cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + * cdef DTYPE_t iw, ih, box_area + */ + __pyx_v_K = (__pyx_v_query_boxes->dimensions[0]); + + /* "box_overlaps.pyx":29 + * cdef unsigned int N = boxes.shape[0] + * cdef unsigned int K = query_boxes.shape[0] + * cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) # <<<<<<<<<<<<<< + * cdef DTYPE_t iw, ih, box_area + * cdef DTYPE_t ua + */ + __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyInt_From_unsigned_int(__pyx_v_N); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = __Pyx_PyInt_From_unsigned_int(__pyx_v_K); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __pyx_t_4 = PyTuple_New(2); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GIVEREF(__pyx_t_1); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_3); + PyTuple_SET_ITEM(__pyx_t_4, 1, __pyx_t_3); + __pyx_t_1 = 0; + __pyx_t_3 = 0; + __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_GIVEREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); + __pyx_t_4 = 0; + __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_4); + __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_DTYPE); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_1) < 0) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_3, __pyx_t_4); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 29, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; + __Pyx_DECREF(__pyx_t_4); 
__pyx_t_4 = 0; + if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) __PYX_ERR(0, 29, __pyx_L1_error) + __pyx_t_5 = ((PyArrayObject *)__pyx_t_1); + { + __Pyx_BufFmt_StackElem __pyx_stack[1]; + if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_overlaps.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_4bbox_DTYPE_t, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 2, 0, __pyx_stack) == -1)) { + __pyx_v_overlaps = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.buf = NULL; + __PYX_ERR(0, 29, __pyx_L1_error) + } else {__pyx_pybuffernd_overlaps.diminfo[0].strides = __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_overlaps.diminfo[0].shape = __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_overlaps.diminfo[1].strides = __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_overlaps.diminfo[1].shape = __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.shape[1]; + } + } + __pyx_t_5 = 0; + __pyx_v_overlaps = ((PyArrayObject *)__pyx_t_1); + __pyx_t_1 = 0; + + /* "box_overlaps.pyx":33 + * cdef DTYPE_t ua + * cdef unsigned int k, n + * for k in range(K): # <<<<<<<<<<<<<< + * box_area = ( + * (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + */ + __pyx_t_6 = __pyx_v_K; + __pyx_t_7 = __pyx_t_6; + for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) { + __pyx_v_k = __pyx_t_8; + + /* "box_overlaps.pyx":35 + * for k in range(K): + * box_area = ( + * (query_boxes[k, 2] - query_boxes[k, 0] + 1) * # <<<<<<<<<<<<<< + * (query_boxes[k, 3] - query_boxes[k, 1] + 1) + * ) + */ + __pyx_t_9 = __pyx_v_k; + __pyx_t_10 = 2; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_9 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_10 < 0) { + __pyx_t_10 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_10 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_10 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 35, __pyx_L1_error) + } + __pyx_t_12 = __pyx_v_k; + __pyx_t_13 = 0; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_12 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_13 < 0) { + __pyx_t_13 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_13 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_13 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 35, __pyx_L1_error) + } + + /* "box_overlaps.pyx":36 + * box_area = ( + * (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + * (query_boxes[k, 3] - query_boxes[k, 1] + 1) # <<<<<<<<<<<<<< + * ) + * for n in range(N): + */ + __pyx_t_14 = __pyx_v_k; + __pyx_t_15 = 3; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_14 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_15 < 0) { + __pyx_t_15 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_15 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_15 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 36, __pyx_L1_error) + } + __pyx_t_16 = __pyx_v_k; + __pyx_t_17 = 1; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= 
(size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 36, __pyx_L1_error) + } + + /* "box_overlaps.pyx":35 + * for k in range(K): + * box_area = ( + * (query_boxes[k, 2] - query_boxes[k, 0] + 1) * # <<<<<<<<<<<<<< + * (query_boxes[k, 3] - query_boxes[k, 1] + 1) + * ) + */ + __pyx_v_box_area = ((((*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_9, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_10, __pyx_pybuffernd_query_boxes.diminfo[1].strides)) - (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_12, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_13, __pyx_pybuffernd_query_boxes.diminfo[1].strides))) + 1.0) * (((*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_14, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_15, __pyx_pybuffernd_query_boxes.diminfo[1].strides)) - (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_query_boxes.diminfo[1].strides))) + 1.0)); + + /* "box_overlaps.pyx":38 + * (query_boxes[k, 3] - query_boxes[k, 1] + 1) + * ) + * for n in range(N): # <<<<<<<<<<<<<< + * iw = ( + * min(boxes[n, 2], query_boxes[k, 2]) - + */ + __pyx_t_18 = __pyx_v_N; + __pyx_t_19 = __pyx_t_18; + for (__pyx_t_20 = 0; __pyx_t_20 < __pyx_t_19; __pyx_t_20+=1) { + __pyx_v_n = __pyx_t_20; + + /* "box_overlaps.pyx":40 + * for n in range(N): + * iw = ( + * min(boxes[n, 2], query_boxes[k, 2]) - # <<<<<<<<<<<<<< + * max(boxes[n, 0], query_boxes[k, 0]) + 1 + * ) + */ + __pyx_t_16 = __pyx_v_k; + __pyx_t_17 = 2; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_t_21 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_query_boxes.diminfo[1].strides)); + __pyx_t_16 = __pyx_v_n; + __pyx_t_17 = 2; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 40, __pyx_L1_error) + } + __pyx_t_22 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_boxes.diminfo[1].strides)); + 
if (((__pyx_t_21 < __pyx_t_22) != 0)) { + __pyx_t_23 = __pyx_t_21; + } else { + __pyx_t_23 = __pyx_t_22; + } + + /* "box_overlaps.pyx":41 + * iw = ( + * min(boxes[n, 2], query_boxes[k, 2]) - + * max(boxes[n, 0], query_boxes[k, 0]) + 1 # <<<<<<<<<<<<<< + * ) + * if iw > 0: + */ + __pyx_t_16 = __pyx_v_k; + __pyx_t_17 = 0; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 41, __pyx_L1_error) + } + __pyx_t_21 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_query_boxes.diminfo[1].strides)); + __pyx_t_16 = __pyx_v_n; + __pyx_t_17 = 0; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 41, __pyx_L1_error) + } + __pyx_t_22 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_boxes.diminfo[1].strides)); + if (((__pyx_t_21 > __pyx_t_22) != 0)) { + __pyx_t_24 = __pyx_t_21; + } else { + __pyx_t_24 = __pyx_t_22; + } + + /* "box_overlaps.pyx":40 + * for n in range(N): + * iw = ( + * min(boxes[n, 2], query_boxes[k, 2]) - # <<<<<<<<<<<<<< + * max(boxes[n, 0], query_boxes[k, 0]) + 1 + * ) + */ + __pyx_v_iw = ((__pyx_t_23 - __pyx_t_24) + 1.0); + + /* "box_overlaps.pyx":43 + * max(boxes[n, 0], query_boxes[k, 0]) + 1 + * ) + * if iw > 0: # <<<<<<<<<<<<<< + * ih = ( + * min(boxes[n, 3], query_boxes[k, 3]) - + */ + __pyx_t_25 = ((__pyx_v_iw > 0.0) != 0); + if (__pyx_t_25) { + + /* "box_overlaps.pyx":45 + * if iw > 0: + * ih = ( + * min(boxes[n, 3], query_boxes[k, 3]) - # <<<<<<<<<<<<<< + * max(boxes[n, 1], query_boxes[k, 1]) + 1 + * ) + */ + __pyx_t_16 = __pyx_v_k; + __pyx_t_17 = 3; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 45, __pyx_L1_error) + } + __pyx_t_24 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_query_boxes.diminfo[1].strides)); + __pyx_t_16 = __pyx_v_n; + __pyx_t_17 = 3; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if 
(unlikely(__pyx_t_17 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 45, __pyx_L1_error) + } + __pyx_t_23 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_boxes.diminfo[1].strides)); + if (((__pyx_t_24 < __pyx_t_23) != 0)) { + __pyx_t_21 = __pyx_t_24; + } else { + __pyx_t_21 = __pyx_t_23; + } + + /* "box_overlaps.pyx":46 + * ih = ( + * min(boxes[n, 3], query_boxes[k, 3]) - + * max(boxes[n, 1], query_boxes[k, 1]) + 1 # <<<<<<<<<<<<<< + * ) + * if ih > 0: + */ + __pyx_t_16 = __pyx_v_k; + __pyx_t_17 = 1; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_query_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_query_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_query_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 46, __pyx_L1_error) + } + __pyx_t_24 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_query_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_query_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_query_boxes.diminfo[1].strides)); + __pyx_t_16 = __pyx_v_n; + __pyx_t_17 = 1; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 46, __pyx_L1_error) + } + __pyx_t_23 = (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_boxes.diminfo[1].strides)); + if (((__pyx_t_24 > __pyx_t_23) != 0)) { + __pyx_t_22 = __pyx_t_24; + } else { + __pyx_t_22 = __pyx_t_23; + } + + /* "box_overlaps.pyx":45 + * if iw > 0: + * ih = ( + * min(boxes[n, 3], query_boxes[k, 3]) - # <<<<<<<<<<<<<< + * max(boxes[n, 1], query_boxes[k, 1]) + 1 + * ) + */ + __pyx_v_ih = ((__pyx_t_21 - __pyx_t_22) + 1.0); + + /* "box_overlaps.pyx":48 + * max(boxes[n, 1], query_boxes[k, 1]) + 1 + * ) + * if ih > 0: # <<<<<<<<<<<<<< + * ua = float( + * (boxes[n, 2] - boxes[n, 0] + 1) * + */ + __pyx_t_25 = ((__pyx_v_ih > 0.0) != 0); + if (__pyx_t_25) { + + /* "box_overlaps.pyx":50 + * if ih > 0: + * ua = float( + * (boxes[n, 2] - boxes[n, 0] + 1) * # <<<<<<<<<<<<<< + * (boxes[n, 3] - boxes[n, 1] + 1) + + * box_area - iw * ih + */ + __pyx_t_16 = __pyx_v_n; + __pyx_t_17 = 2; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_16 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_17 < 0) { + __pyx_t_17 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_17 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 50, __pyx_L1_error) + } + __pyx_t_14 = __pyx_v_n; + __pyx_t_15 = 0; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_14 >= 
(size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_15 < 0) { + __pyx_t_15 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_15 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_15 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 50, __pyx_L1_error) + } + + /* "box_overlaps.pyx":51 + * ua = float( + * (boxes[n, 2] - boxes[n, 0] + 1) * + * (boxes[n, 3] - boxes[n, 1] + 1) + # <<<<<<<<<<<<<< + * box_area - iw * ih + * ) + */ + __pyx_t_12 = __pyx_v_n; + __pyx_t_13 = 3; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_12 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_13 < 0) { + __pyx_t_13 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_13 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_13 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 51, __pyx_L1_error) + } + __pyx_t_9 = __pyx_v_n; + __pyx_t_10 = 1; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_9 >= (size_t)__pyx_pybuffernd_boxes.diminfo[0].shape)) __pyx_t_11 = 0; + if (__pyx_t_10 < 0) { + __pyx_t_10 += __pyx_pybuffernd_boxes.diminfo[1].shape; + if (unlikely(__pyx_t_10 < 0)) __pyx_t_11 = 1; + } else if (unlikely(__pyx_t_10 >= __pyx_pybuffernd_boxes.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 51, __pyx_L1_error) + } + + /* "box_overlaps.pyx":49 + * ) + * if ih > 0: + * ua = float( # <<<<<<<<<<<<<< + * (boxes[n, 2] - boxes[n, 0] + 1) * + * (boxes[n, 3] - boxes[n, 1] + 1) + + */ + __pyx_v_ua = ((double)((((((*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_17, __pyx_pybuffernd_boxes.diminfo[1].strides)) - (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_14, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_15, __pyx_pybuffernd_boxes.diminfo[1].strides))) + 1.0) * (((*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_12, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_13, __pyx_pybuffernd_boxes.diminfo[1].strides)) - (*__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_boxes.rcbuffer->pybuffer.buf, __pyx_t_9, __pyx_pybuffernd_boxes.diminfo[0].strides, __pyx_t_10, __pyx_pybuffernd_boxes.diminfo[1].strides))) + 1.0)) + __pyx_v_box_area) - (__pyx_v_iw * __pyx_v_ih))); + + /* "box_overlaps.pyx":54 + * box_area - iw * ih + * ) + * overlaps[n, k] = iw * ih / ua # <<<<<<<<<<<<<< + * return overlaps + */ + __pyx_t_22 = (__pyx_v_iw * __pyx_v_ih); + if (unlikely(__pyx_v_ua == 0)) { + PyErr_SetString(PyExc_ZeroDivisionError, "float division"); + __PYX_ERR(0, 54, __pyx_L1_error) + } + __pyx_t_9 = __pyx_v_n; + __pyx_t_12 = __pyx_v_k; + __pyx_t_11 = -1; + if (unlikely(__pyx_t_9 >= (size_t)__pyx_pybuffernd_overlaps.diminfo[0].shape)) __pyx_t_11 = 0; + if (unlikely(__pyx_t_12 >= (size_t)__pyx_pybuffernd_overlaps.diminfo[1].shape)) __pyx_t_11 = 1; + if (unlikely(__pyx_t_11 != -1)) { + __Pyx_RaiseBufferIndexError(__pyx_t_11); + __PYX_ERR(0, 54, __pyx_L1_error) + } + *__Pyx_BufPtrStrided2d(__pyx_t_4bbox_DTYPE_t *, __pyx_pybuffernd_overlaps.rcbuffer->pybuffer.buf, __pyx_t_9, __pyx_pybuffernd_overlaps.diminfo[0].strides, __pyx_t_12, 
__pyx_pybuffernd_overlaps.diminfo[1].strides) = (__pyx_t_22 / __pyx_v_ua); + + /* "box_overlaps.pyx":48 + * max(boxes[n, 1], query_boxes[k, 1]) + 1 + * ) + * if ih > 0: # <<<<<<<<<<<<<< + * ua = float( + * (boxes[n, 2] - boxes[n, 0] + 1) * + */ + } + + /* "box_overlaps.pyx":43 + * max(boxes[n, 0], query_boxes[k, 0]) + 1 + * ) + * if iw > 0: # <<<<<<<<<<<<<< + * ih = ( + * min(boxes[n, 3], query_boxes[k, 3]) - + */ + } + } + } + + /* "box_overlaps.pyx":55 + * ) + * overlaps[n, k] = iw * ih / ua + * return overlaps # <<<<<<<<<<<<<< + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_overlaps)); + __pyx_r = ((PyObject *)__pyx_v_overlaps); + goto __pyx_L0; + + /* "box_overlaps.pyx":15 + * ctypedef np.float_t DTYPE_t + * + * def bbox_overlaps( # <<<<<<<<<<<<<< + * np.ndarray[DTYPE_t, ndim=2] boxes, + * np.ndarray[DTYPE_t, ndim=2] query_boxes): + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_3); + __Pyx_XDECREF(__pyx_t_4); + { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_boxes.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_overlaps.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_query_boxes.rcbuffer->pybuffer); + __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);} + __Pyx_AddTraceback("bbox.bbox_overlaps", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + goto __pyx_L2; + __pyx_L0:; + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_boxes.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_overlaps.rcbuffer->pybuffer); + __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_query_boxes.rcbuffer->pybuffer); + __pyx_L2:; + __Pyx_XDECREF((PyObject *)__pyx_v_overlaps); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":734 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":735 + * + * cdef inline object PyArray_MultiIterNew1(a): + * return PyArray_MultiIterNew(1, a) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew2(a, b): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 735, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":734 + * ctypedef npy_cdouble complex_t + * + * cdef inline object PyArray_MultiIterNew1(a): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(1, a) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; 
+} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":737 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":738 + * + * cdef inline object PyArray_MultiIterNew2(a, b): + * return PyArray_MultiIterNew(2, a, b) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 738, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":737 + * return PyArray_MultiIterNew(1, a) + * + * cdef inline object PyArray_MultiIterNew2(a, b): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(2, a, b) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":740 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":741 + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): + * return PyArray_MultiIterNew(3, a, b, c) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 741, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":740 + * return PyArray_MultiIterNew(2, a, b) + * + * cdef inline object PyArray_MultiIterNew3(a, b, c): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(3, a, b, c) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":743 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object 
PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":744 + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): + * return PyArray_MultiIterNew(4, a, b, c, d) # <<<<<<<<<<<<<< + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 744, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":743 + * return PyArray_MultiIterNew(3, a, b, c) + * + * cdef inline object PyArray_MultiIterNew4(a, b, c, d): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(4, a, b, c, d) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":746 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":747 + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + * return PyArray_MultiIterNew(5, a, b, c, d, e) # <<<<<<<<<<<<<< + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + */ + __Pyx_XDECREF(__pyx_r); + __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 747, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_r = __pyx_t_1; + __pyx_t_1 = 0; + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":746 + * return PyArray_MultiIterNew(4, a, b, c, d) + * + * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): # <<<<<<<<<<<<<< + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = 0; + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* 
"../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":749 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyDataType_SHAPE(PyArray_Descr *__pyx_v_d) { + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("PyDataType_SHAPE", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":750 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + __pyx_t_1 = (PyDataType_HASSUBARRAY(__pyx_v_d) != 0); + if (__pyx_t_1) { + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":751 + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape # <<<<<<<<<<<<<< + * else: + * return () + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject*)__pyx_v_d->subarray->shape)); + __pyx_r = ((PyObject*)__pyx_v_d->subarray->shape); + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":750 + * + * cdef inline tuple PyDataType_SHAPE(dtype d): + * if PyDataType_HASSUBARRAY(d): # <<<<<<<<<<<<<< + * return d.subarray.shape + * else: + */ + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":753 + * return d.subarray.shape + * else: + * return () # <<<<<<<<<<<<<< + * + * + */ + /*else*/ { + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(__pyx_empty_tuple); + __pyx_r = __pyx_empty_tuple; + goto __pyx_L0; + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":749 + * return PyArray_MultiIterNew(5, a, b, c, d, e) + * + * cdef inline tuple PyDataType_SHAPE(dtype d): # <<<<<<<<<<<<<< + * if PyDataType_HASSUBARRAY(d): + * return d.subarray.shape + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":868 + * int _import_umath() except -1 + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * Py_INCREF(base) # important to do this before stealing the reference below! + * PyArray_SetBaseObject(arr, base) + */ + +static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("set_array_base", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":869 + * + * cdef inline void set_array_base(ndarray arr, object base): + * Py_INCREF(base) # important to do this before stealing the reference below! # <<<<<<<<<<<<<< + * PyArray_SetBaseObject(arr, base) + * + */ + Py_INCREF(__pyx_v_base); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":870 + * cdef inline void set_array_base(ndarray arr, object base): + * Py_INCREF(base) # important to do this before stealing the reference below! 
+ * PyArray_SetBaseObject(arr, base) # <<<<<<<<<<<<<< + * + * cdef inline object get_array_base(ndarray arr): + */ + (void)(PyArray_SetBaseObject(__pyx_v_arr, __pyx_v_base)); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":868 + * int _import_umath() except -1 + * + * cdef inline void set_array_base(ndarray arr, object base): # <<<<<<<<<<<<<< + * Py_INCREF(base) # important to do this before stealing the reference below! + * PyArray_SetBaseObject(arr, base) + */ + + /* function exit code */ + __Pyx_RefNannyFinishContext(); +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":872 + * PyArray_SetBaseObject(arr, base) + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * base = PyArray_BASE(arr) + * if base is NULL: + */ + +static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) { + PyObject *__pyx_v_base; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + int __pyx_t_1; + __Pyx_RefNannySetupContext("get_array_base", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":873 + * + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) # <<<<<<<<<<<<<< + * if base is NULL: + * return None + */ + __pyx_v_base = PyArray_BASE(__pyx_v_arr); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":874 + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) + * if base is NULL: # <<<<<<<<<<<<<< + * return None + * return base + */ + __pyx_t_1 = ((__pyx_v_base == NULL) != 0); + if (__pyx_t_1) { + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":875 + * base = PyArray_BASE(arr) + * if base is NULL: + * return None # <<<<<<<<<<<<<< + * return base + * + */ + __Pyx_XDECREF(__pyx_r); + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":874 + * cdef inline object get_array_base(ndarray arr): + * base = PyArray_BASE(arr) + * if base is NULL: # <<<<<<<<<<<<<< + * return None + * return base + */ + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":876 + * if base is NULL: + * return None + * return base # <<<<<<<<<<<<<< + * + * # Versions of the import_* functions which are more suitable for + */ + __Pyx_XDECREF(__pyx_r); + __Pyx_INCREF(((PyObject *)__pyx_v_base)); + __pyx_r = ((PyObject *)__pyx_v_base); + goto __pyx_L0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":872 + * PyArray_SetBaseObject(arr, base) + * + * cdef inline object get_array_base(ndarray arr): # <<<<<<<<<<<<<< + * base = PyArray_BASE(arr) + * if base is NULL: + */ + + /* function exit code */ + __pyx_L0:; + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":880 + * # Versions of the import_* functions which are more suitable for + * # Cython code. 
+ * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * __pyx_import_array() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_array(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_array", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":881 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":882 + * cdef inline int import_array() except -1: + * try: + * __pyx_import_array() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") + */ + __pyx_t_4 = _import_array(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 882, __pyx_L3_error) + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":881 + * # Cython code. + * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":883 + * try: + * __pyx_import_array() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.multiarray failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 883, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":884 + * __pyx_import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple_, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 884, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 884, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":881 + * # Cython code. 
+ * cdef inline int import_array() except -1: + * try: # <<<<<<<<<<<<<< + * __pyx_import_array() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":880 + * # Versions of the import_* functions which are more suitable for + * # Cython code. + * cdef inline int import_array() except -1: # <<<<<<<<<<<<<< + * try: + * __pyx_import_array() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_array", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":886 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_umath(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_umath", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":887 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":888 + * cdef inline int import_umath() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 888, __pyx_L3_error) + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":887 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + __pyx_L3_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":889 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 889, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* 
"../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":890 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 890, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 890, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":887 + * + * cdef inline int import_umath() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":886 + * raise ImportError("numpy.core.multiarray failed to import") + * + * cdef inline int import_umath() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_umath", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":892 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + +static CYTHON_INLINE int __pyx_f_5numpy_import_ufunc(void) { + int __pyx_r; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + PyObject *__pyx_t_3 = NULL; + int __pyx_t_4; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("import_ufunc", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":893 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + { + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ExceptionSave(&__pyx_t_1, &__pyx_t_2, &__pyx_t_3); + __Pyx_XGOTREF(__pyx_t_1); + __Pyx_XGOTREF(__pyx_t_2); + __Pyx_XGOTREF(__pyx_t_3); + /*try:*/ { + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":894 + * cdef inline int import_ufunc() except -1: + * try: + * _import_umath() # <<<<<<<<<<<<<< + * except Exception: + * raise ImportError("numpy.core.umath failed to import") + */ + __pyx_t_4 = _import_umath(); if (unlikely(__pyx_t_4 == ((int)-1))) __PYX_ERR(1, 894, __pyx_L3_error) + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":893 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + } + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_XDECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0; + goto __pyx_L8_try_end; + 
__pyx_L3_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":895 + * try: + * _import_umath() + * except Exception: # <<<<<<<<<<<<<< + * raise ImportError("numpy.core.umath failed to import") + * + */ + __pyx_t_4 = __Pyx_PyErr_ExceptionMatches(((PyObject *)(&((PyTypeObject*)PyExc_Exception)[0]))); + if (__pyx_t_4) { + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + if (__Pyx_GetException(&__pyx_t_5, &__pyx_t_6, &__pyx_t_7) < 0) __PYX_ERR(1, 895, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_5); + __Pyx_GOTREF(__pyx_t_6); + __Pyx_GOTREF(__pyx_t_7); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":896 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef extern from *: + */ + __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ImportError, __pyx_tuple__2, NULL); if (unlikely(!__pyx_t_8)) __PYX_ERR(1, 896, __pyx_L5_except_error) + __Pyx_GOTREF(__pyx_t_8); + __Pyx_Raise(__pyx_t_8, 0, 0, 0); + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __PYX_ERR(1, 896, __pyx_L5_except_error) + } + goto __pyx_L5_except_error; + __pyx_L5_except_error:; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":893 + * + * cdef inline int import_ufunc() except -1: + * try: # <<<<<<<<<<<<<< + * _import_umath() + * except Exception: + */ + __Pyx_XGIVEREF(__pyx_t_1); + __Pyx_XGIVEREF(__pyx_t_2); + __Pyx_XGIVEREF(__pyx_t_3); + __Pyx_ExceptionReset(__pyx_t_1, __pyx_t_2, __pyx_t_3); + goto __pyx_L1_error; + __pyx_L8_try_end:; + } + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":892 + * raise ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /* function exit code */ + __pyx_r = 0; + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("numpy.import_ufunc", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = -1; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyMethodDef __pyx_methods[] = { + {0, 0, 0, 0} +}; + +#if PY_MAJOR_VERSION >= 3 +#if CYTHON_PEP489_MULTI_PHASE_INIT +static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/ +static int __pyx_pymod_exec_bbox(PyObject* module); /*proto*/ +static PyModuleDef_Slot __pyx_moduledef_slots[] = { + {Py_mod_create, (void*)__pyx_pymod_create}, + {Py_mod_exec, (void*)__pyx_pymod_exec_bbox}, + {0, NULL} +}; +#endif + +static struct PyModuleDef __pyx_moduledef = { + PyModuleDef_HEAD_INIT, + "bbox", + 0, /* m_doc */ + #if CYTHON_PEP489_MULTI_PHASE_INIT + 0, /* m_size */ + #else + -1, /* m_size */ + #endif + __pyx_methods /* m_methods */, + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_moduledef_slots, /* m_slots */ + #else + NULL, /* m_reload */ + #endif + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL /* m_free */ +}; +#endif +#ifndef CYTHON_SMALL_CODE +#if defined(__clang__) + #define CYTHON_SMALL_CODE +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define CYTHON_SMALL_CODE __attribute__((cold)) +#else + #define CYTHON_SMALL_CODE +#endif +#endif + +static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_DTYPE, __pyx_k_DTYPE, sizeof(__pyx_k_DTYPE), 0, 0, 1, 1}, + {&__pyx_n_s_ImportError, 
__pyx_k_ImportError, sizeof(__pyx_k_ImportError), 0, 0, 1, 1}, + {&__pyx_n_s_K, __pyx_k_K, sizeof(__pyx_k_K), 0, 0, 1, 1}, + {&__pyx_n_s_N, __pyx_k_N, sizeof(__pyx_k_N), 0, 0, 1, 1}, + {&__pyx_n_s_bbox, __pyx_k_bbox, sizeof(__pyx_k_bbox), 0, 0, 1, 1}, + {&__pyx_n_s_bbox_overlaps, __pyx_k_bbox_overlaps, sizeof(__pyx_k_bbox_overlaps), 0, 0, 1, 1}, + {&__pyx_n_s_box_area, __pyx_k_box_area, sizeof(__pyx_k_box_area), 0, 0, 1, 1}, + {&__pyx_kp_s_box_overlaps_pyx, __pyx_k_box_overlaps_pyx, sizeof(__pyx_k_box_overlaps_pyx), 0, 0, 1, 0}, + {&__pyx_n_s_boxes, __pyx_k_boxes, sizeof(__pyx_k_boxes), 0, 0, 1, 1}, + {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1}, + {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1}, + {&__pyx_n_s_float, __pyx_k_float, sizeof(__pyx_k_float), 0, 0, 1, 1}, + {&__pyx_n_s_ih, __pyx_k_ih, sizeof(__pyx_k_ih), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, + {&__pyx_n_s_iw, __pyx_k_iw, sizeof(__pyx_k_iw), 0, 0, 1, 1}, + {&__pyx_n_s_k, __pyx_k_k, sizeof(__pyx_k_k), 0, 0, 1, 1}, + {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, + {&__pyx_n_s_n, __pyx_k_n, sizeof(__pyx_k_n), 0, 0, 1, 1}, + {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, + {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1}, + {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1}, + {&__pyx_kp_s_numpy_core_multiarray_failed_to, __pyx_k_numpy_core_multiarray_failed_to, sizeof(__pyx_k_numpy_core_multiarray_failed_to), 0, 0, 1, 0}, + {&__pyx_kp_s_numpy_core_umath_failed_to_impor, __pyx_k_numpy_core_umath_failed_to_impor, sizeof(__pyx_k_numpy_core_umath_failed_to_impor), 0, 0, 1, 0}, + {&__pyx_n_s_overlaps, __pyx_k_overlaps, sizeof(__pyx_k_overlaps), 0, 0, 1, 1}, + {&__pyx_n_s_query_boxes, __pyx_k_query_boxes, sizeof(__pyx_k_query_boxes), 0, 0, 1, 1}, + {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_ua, __pyx_k_ua, sizeof(__pyx_k_ua), 0, 0, 1, 1}, + {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0} +}; +static CYTHON_SMALL_CODE int __Pyx_InitCachedBuiltins(void) { + __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) __PYX_ERR(0, 33, __pyx_L1_error) + __pyx_builtin_ImportError = __Pyx_GetBuiltinName(__pyx_n_s_ImportError); if (!__pyx_builtin_ImportError) __PYX_ERR(1, 884, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitCachedConstants(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":884 + * __pyx_import_array() + * except Exception: + * raise ImportError("numpy.core.multiarray failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_umath() except -1: + */ + __pyx_tuple_ = PyTuple_Pack(1, __pyx_kp_s_numpy_core_multiarray_failed_to); if (unlikely(!__pyx_tuple_)) __PYX_ERR(1, 884, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple_); + __Pyx_GIVEREF(__pyx_tuple_); + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":890 + * _import_umath() + * except Exception: + * raise ImportError("numpy.core.umath failed to import") # <<<<<<<<<<<<<< + * + * cdef inline int import_ufunc() except -1: + */ + __pyx_tuple__2 = PyTuple_Pack(1, 
__pyx_kp_s_numpy_core_umath_failed_to_impor); if (unlikely(!__pyx_tuple__2)) __PYX_ERR(1, 890, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + + /* "box_overlaps.pyx":15 + * ctypedef np.float_t DTYPE_t + * + * def bbox_overlaps( # <<<<<<<<<<<<<< + * np.ndarray[DTYPE_t, ndim=2] boxes, + * np.ndarray[DTYPE_t, ndim=2] query_boxes): + */ + __pyx_tuple__3 = PyTuple_Pack(11, __pyx_n_s_boxes, __pyx_n_s_query_boxes, __pyx_n_s_N, __pyx_n_s_K, __pyx_n_s_overlaps, __pyx_n_s_iw, __pyx_n_s_ih, __pyx_n_s_box_area, __pyx_n_s_ua, __pyx_n_s_k, __pyx_n_s_n); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_GOTREF(__pyx_tuple__3); + __Pyx_GIVEREF(__pyx_tuple__3); + __pyx_codeobj__4 = (PyObject*)__Pyx_PyCode_New(2, 0, 11, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__3, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_box_overlaps_pyx, __pyx_n_s_bbox_overlaps, 15, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__4)) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_InitGlobals(void) { + if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + return 0; + __pyx_L1_error:; + return -1; +} + +static CYTHON_SMALL_CODE int __Pyx_modinit_global_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_export_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_init_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_type_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_variable_import_code(void); /*proto*/ +static CYTHON_SMALL_CODE int __Pyx_modinit_function_import_code(void); /*proto*/ + +static int __Pyx_modinit_global_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0); + /*--- Global init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_variable_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0); + /*--- Variable export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_export_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0); + /*--- Function export code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_init_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0); + /*--- Type init code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_type_import_code(void) { + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0); + /*--- Type import code ---*/ + __pyx_t_1 = PyImport_ImportModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_t_1)) __PYX_ERR(2, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__pyx_t_1, __Pyx_BUILTIN_MODULE_NAME, "type", + #if defined(PYPY_VERSION_NUM) && PYPY_VERSION_NUM < 0x050B0000 + sizeof(PyTypeObject), + #else + sizeof(PyHeapTypeObject), + #endif + 
__Pyx_ImportType_CheckSize_Warn); + if (!__pyx_ptype_7cpython_4type_type) __PYX_ERR(2, 9, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = PyImport_ImportModule("numpy"); if (unlikely(!__pyx_t_1)) __PYX_ERR(1, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_ptype_5numpy_dtype = __Pyx_ImportType(__pyx_t_1, "numpy", "dtype", sizeof(PyArray_Descr), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_dtype) __PYX_ERR(1, 199, __pyx_L1_error) + __pyx_ptype_5numpy_flatiter = __Pyx_ImportType(__pyx_t_1, "numpy", "flatiter", sizeof(PyArrayIterObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_flatiter) __PYX_ERR(1, 222, __pyx_L1_error) + __pyx_ptype_5numpy_broadcast = __Pyx_ImportType(__pyx_t_1, "numpy", "broadcast", sizeof(PyArrayMultiIterObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_broadcast) __PYX_ERR(1, 226, __pyx_L1_error) + __pyx_ptype_5numpy_ndarray = __Pyx_ImportType(__pyx_t_1, "numpy", "ndarray", sizeof(PyArrayObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_ndarray) __PYX_ERR(1, 238, __pyx_L1_error) + __pyx_ptype_5numpy_ufunc = __Pyx_ImportType(__pyx_t_1, "numpy", "ufunc", sizeof(PyUFuncObject), __Pyx_ImportType_CheckSize_Ignore); + if (!__pyx_ptype_5numpy_ufunc) __PYX_ERR(1, 764, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_RefNannyFinishContext(); + return 0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_RefNannyFinishContext(); + return -1; +} + +static int __Pyx_modinit_variable_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0); + /*--- Variable import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + +static int __Pyx_modinit_function_import_code(void) { + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0); + /*--- Function import code ---*/ + __Pyx_RefNannyFinishContext(); + return 0; +} + + +#ifndef CYTHON_NO_PYINIT_EXPORT +#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC +#elif PY_MAJOR_VERSION < 3 +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" void +#else +#define __Pyx_PyMODINIT_FUNC void +#endif +#else +#ifdef __cplusplus +#define __Pyx_PyMODINIT_FUNC extern "C" PyObject * +#else +#define __Pyx_PyMODINIT_FUNC PyObject * +#endif +#endif + + +#if PY_MAJOR_VERSION < 3 +__Pyx_PyMODINIT_FUNC initbbox(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC initbbox(void) +#else +__Pyx_PyMODINIT_FUNC PyInit_bbox(void) CYTHON_SMALL_CODE; /*proto*/ +__Pyx_PyMODINIT_FUNC PyInit_bbox(void) +#if CYTHON_PEP489_MULTI_PHASE_INIT +{ + return PyModuleDef_Init(&__pyx_moduledef); +} +static CYTHON_SMALL_CODE int __Pyx_check_single_interpreter(void) { + #if PY_VERSION_HEX >= 0x030700A1 + static PY_INT64_T main_interpreter_id = -1; + PY_INT64_T current_id = PyInterpreterState_GetID(PyThreadState_Get()->interp); + if (main_interpreter_id == -1) { + main_interpreter_id = current_id; + return (unlikely(current_id == -1)) ? 
-1 : 0; + } else if (unlikely(main_interpreter_id != current_id)) + #else + static PyInterpreterState *main_interpreter = NULL; + PyInterpreterState *current_interpreter = PyThreadState_Get()->interp; + if (!main_interpreter) { + main_interpreter = current_interpreter; + } else if (unlikely(main_interpreter != current_interpreter)) + #endif + { + PyErr_SetString( + PyExc_ImportError, + "Interpreter change detected - this module can only be loaded into one interpreter per process."); + return -1; + } + return 0; +} +static CYTHON_SMALL_CODE int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name, int allow_none) { + PyObject *value = PyObject_GetAttrString(spec, from_name); + int result = 0; + if (likely(value)) { + if (allow_none || value != Py_None) { + result = PyDict_SetItemString(moddict, to_name, value); + } + Py_DECREF(value); + } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Clear(); + } else { + result = -1; + } + return result; +} +static CYTHON_SMALL_CODE PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { + PyObject *module = NULL, *moddict, *modname; + if (__Pyx_check_single_interpreter()) + return NULL; + if (__pyx_m) + return __Pyx_NewRef(__pyx_m); + modname = PyObject_GetAttrString(spec, "name"); + if (unlikely(!modname)) goto bad; + module = PyModule_NewObject(modname); + Py_DECREF(modname); + if (unlikely(!module)) goto bad; + moddict = PyModule_GetDict(module); + if (unlikely(!moddict)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__", 1) < 0)) goto bad; + if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__", 0) < 0)) goto bad; + return module; +bad: + Py_XDECREF(module); + return NULL; +} + + +static CYTHON_SMALL_CODE int __pyx_pymod_exec_bbox(PyObject *__pyx_pyinit_module) +#endif +#endif +{ + PyObject *__pyx_t_1 = NULL; + PyObject *__pyx_t_2 = NULL; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannyDeclarations + #if CYTHON_PEP489_MULTI_PHASE_INIT + if (__pyx_m) { + if (__pyx_m == __pyx_pyinit_module) return 0; + PyErr_SetString(PyExc_RuntimeError, "Module 'bbox' has already been imported. 
Re-initialisation is not supported."); + return -1; + } + #elif PY_MAJOR_VERSION >= 3 + if (__pyx_m) return __Pyx_NewRef(__pyx_m); + #endif + #if CYTHON_REFNANNY +__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny"); +if (!__Pyx_RefNanny) { + PyErr_Clear(); + __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); + if (!__Pyx_RefNanny) + Py_FatalError("failed to import 'refnanny' module"); +} +#endif + __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_bbox(void)", 0); + if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pxy_PyFrame_Initialize_Offsets + __Pxy_PyFrame_Initialize_Offsets(); + #endif + __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error) + __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error) + #ifdef __Pyx_CyFunction_USED + if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_FusedFunction_USED + if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Coroutine_USED + if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_Generator_USED + if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_AsyncGen_USED + if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + #ifdef __Pyx_StopAsyncIteration_USED + if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + /*--- Library function declarations ---*/ + /*--- Threads initialization code ---*/ + #if defined(WITH_THREAD) && PY_VERSION_HEX < 0x030700F0 && defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS + PyEval_InitThreads(); + #endif + /*--- Module creation code ---*/ + #if CYTHON_PEP489_MULTI_PHASE_INIT + __pyx_m = __pyx_pyinit_module; + Py_INCREF(__pyx_m); + #else + #if PY_MAJOR_VERSION < 3 + __pyx_m = Py_InitModule4("bbox", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); + #else + __pyx_m = PyModule_Create(&__pyx_moduledef); + #endif + if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_d); + __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_b); + __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error) + Py_INCREF(__pyx_cython_runtime); + if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Initialize various global constants etc. 
---*/ + if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT) + if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + if (__pyx_module_is_main_bbox) { + if (PyObject_SetAttr(__pyx_m, __pyx_n_s_name, __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + } + #if PY_MAJOR_VERSION >= 3 + { + PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error) + if (!PyDict_GetItemString(modules, "bbox")) { + if (unlikely(PyDict_SetItemString(modules, "bbox", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + } + } + #endif + /*--- Builtin init code ---*/ + if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Constants init code ---*/ + if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + /*--- Global type/function init code ---*/ + (void)__Pyx_modinit_global_init_code(); + (void)__Pyx_modinit_variable_export_code(); + (void)__Pyx_modinit_function_export_code(); + (void)__Pyx_modinit_type_init_code(); + if (unlikely(__Pyx_modinit_type_import_code() < 0)) __PYX_ERR(0, 1, __pyx_L1_error) + (void)__Pyx_modinit_variable_import_code(); + (void)__Pyx_modinit_function_import_code(); + /*--- Execution code ---*/ + #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED) + if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error) + #endif + + /* "box_overlaps.pyx":9 + * + * cimport cython + * import numpy as np # <<<<<<<<<<<<<< + * cimport numpy as np + * + */ + __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) __PYX_ERR(0, 9, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "box_overlaps.pyx":12 + * cimport numpy as np + * + * DTYPE = np.float # <<<<<<<<<<<<<< + * ctypedef np.float_t DTYPE_t + * + */ + __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_float); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + if (PyDict_SetItem(__pyx_d, __pyx_n_s_DTYPE, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "box_overlaps.pyx":15 + * ctypedef np.float_t DTYPE_t + * + * def bbox_overlaps( # <<<<<<<<<<<<<< + * np.ndarray[DTYPE_t, ndim=2] boxes, + * np.ndarray[DTYPE_t, ndim=2] query_boxes): + */ + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_4bbox_1bbox_overlaps, NULL, __pyx_n_s_bbox); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_bbox_overlaps, __pyx_t_2) < 0) __PYX_ERR(0, 15, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "box_overlaps.pyx":1 + * # -------------------------------------------------------- # <<<<<<<<<<<<<< + * # Fast R-CNN + * # Copyright (c) 2015 Microsoft + */ + __pyx_t_2 = __Pyx_PyDict_NewPresized(0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "../../../miniconda3/envs/py3.6/lib/python3.6/site-packages/numpy/__init__.pxd":892 + * raise 
ImportError("numpy.core.umath failed to import") + * + * cdef inline int import_ufunc() except -1: # <<<<<<<<<<<<<< + * try: + * _import_umath() + */ + + /*--- Wrapped vars code ---*/ + + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + if (__pyx_m) { + if (__pyx_d) { + __Pyx_AddTraceback("init bbox", __pyx_clineno, __pyx_lineno, __pyx_filename); + } + Py_CLEAR(__pyx_m); + } else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_ImportError, "init bbox"); + } + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + #if CYTHON_PEP489_MULTI_PHASE_INIT + return (__pyx_m != NULL) ? 0 : -1; + #elif PY_MAJOR_VERSION >= 3 + return __pyx_m; + #else + return; + #endif +} + +/* --- Runtime support code --- */ +/* Refnanny */ +#if CYTHON_REFNANNY +static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { + PyObject *m = NULL, *p = NULL; + void *r = NULL; + m = PyImport_ImportModule(modname); + if (!m) goto end; + p = PyObject_GetAttrString(m, "RefNannyAPI"); + if (!p) goto end; + r = PyLong_AsVoidPtr(p); +end: + Py_XDECREF(p); + Py_XDECREF(m); + return (__Pyx_RefNannyAPIStruct *)r; +} +#endif + +/* PyObjectGetAttrStr */ +#if CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { + PyTypeObject* tp = Py_TYPE(obj); + if (likely(tp->tp_getattro)) + return tp->tp_getattro(obj, attr_name); +#if PY_MAJOR_VERSION < 3 + if (likely(tp->tp_getattr)) + return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); +#endif + return PyObject_GetAttr(obj, attr_name); +} +#endif + +/* GetBuiltinName */ +static PyObject *__Pyx_GetBuiltinName(PyObject *name) { + PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name); + if (unlikely(!result)) { + PyErr_Format(PyExc_NameError, +#if PY_MAJOR_VERSION >= 3 + "name '%U' is not defined", name); +#else + "name '%.200s' is not defined", PyString_AS_STRING(name)); +#endif + } + return result; +} + +/* RaiseArgTupleInvalid */ +static void __Pyx_RaiseArgtupleInvalid( + const char* func_name, + int exact, + Py_ssize_t num_min, + Py_ssize_t num_max, + Py_ssize_t num_found) +{ + Py_ssize_t num_expected; + const char *more_or_less; + if (num_found < num_min) { + num_expected = num_min; + more_or_less = "at least"; + } else { + num_expected = num_max; + more_or_less = "at most"; + } + if (exact) { + more_or_less = "exactly"; + } + PyErr_Format(PyExc_TypeError, + "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)", + func_name, more_or_less, num_expected, + (num_expected == 1) ? 
"" : "s", num_found); +} + +/* RaiseDoubleKeywords */ +static void __Pyx_RaiseDoubleKeywordsError( + const char* func_name, + PyObject* kw_name) +{ + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION >= 3 + "%s() got multiple values for keyword argument '%U'", func_name, kw_name); + #else + "%s() got multiple values for keyword argument '%s'", func_name, + PyString_AsString(kw_name)); + #endif +} + +/* ParseKeywords */ +static int __Pyx_ParseOptionalKeywords( + PyObject *kwds, + PyObject **argnames[], + PyObject *kwds2, + PyObject *values[], + Py_ssize_t num_pos_args, + const char* function_name) +{ + PyObject *key = 0, *value = 0; + Py_ssize_t pos = 0; + PyObject*** name; + PyObject*** first_kw_arg = argnames + num_pos_args; + while (PyDict_Next(kwds, &pos, &key, &value)) { + name = first_kw_arg; + while (*name && (**name != key)) name++; + if (*name) { + values[name-argnames] = value; + continue; + } + name = first_kw_arg; + #if PY_MAJOR_VERSION < 3 + if (likely(PyString_Check(key))) { + while (*name) { + if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) + && _PyString_Eq(**name, key)) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + if ((**argname == key) || ( + (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key)) + && _PyString_Eq(**argname, key))) { + goto arg_passed_twice; + } + argname++; + } + } + } else + #endif + if (likely(PyUnicode_Check(key))) { + while (*name) { + int cmp = (**name == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**name) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 1 : + #endif + PyUnicode_Compare(**name, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) { + values[name-argnames] = value; + break; + } + name++; + } + if (*name) continue; + else { + PyObject*** argname = argnames; + while (argname != first_kw_arg) { + int cmp = (**argname == key) ? 0 : + #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3 + (__Pyx_PyUnicode_GET_LENGTH(**argname) != __Pyx_PyUnicode_GET_LENGTH(key)) ? 
1 : + #endif + PyUnicode_Compare(**argname, key); + if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; + if (cmp == 0) goto arg_passed_twice; + argname++; + } + } + } else + goto invalid_keyword_type; + if (kwds2) { + if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad; + } else { + goto invalid_keyword; + } + } + return 0; +arg_passed_twice: + __Pyx_RaiseDoubleKeywordsError(function_name, key); + goto bad; +invalid_keyword_type: + PyErr_Format(PyExc_TypeError, + "%.200s() keywords must be strings", function_name); + goto bad; +invalid_keyword: + PyErr_Format(PyExc_TypeError, + #if PY_MAJOR_VERSION < 3 + "%.200s() got an unexpected keyword argument '%.200s'", + function_name, PyString_AsString(key)); + #else + "%s() got an unexpected keyword argument '%U'", + function_name, key); + #endif +bad: + return -1; +} + +/* ArgTypeTest */ +static int __Pyx__ArgTypeTest(PyObject *obj, PyTypeObject *type, const char *name, int exact) +{ + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + else if (exact) { + #if PY_MAJOR_VERSION == 2 + if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj))) return 1; + #endif + } + else { + if (likely(__Pyx_TypeCheck(obj, type))) return 1; + } + PyErr_Format(PyExc_TypeError, + "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", + name, type->tp_name, Py_TYPE(obj)->tp_name); + return 0; +} + +/* IsLittleEndian */ +static CYTHON_INLINE int __Pyx_Is_Little_Endian(void) +{ + union { + uint32_t u32; + uint8_t u8[4]; + } S; + S.u32 = 0x01020304; + return S.u8[0] == 4; +} + +/* BufferFormatCheck */ +static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx, + __Pyx_BufFmt_StackElem* stack, + __Pyx_TypeInfo* type) { + stack[0].field = &ctx->root; + stack[0].parent_offset = 0; + ctx->root.type = type; + ctx->root.name = "buffer dtype"; + ctx->root.offset = 0; + ctx->head = stack; + ctx->head->field = &ctx->root; + ctx->fmt_offset = 0; + ctx->head->parent_offset = 0; + ctx->new_packmode = '@'; + ctx->enc_packmode = '@'; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->is_complex = 0; + ctx->is_valid_array = 0; + ctx->struct_alignment = 0; + while (type->typegroup == 'S') { + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = 0; + type = type->fields->type; + } +} +static int __Pyx_BufFmt_ParseNumber(const char** ts) { + int count; + const char* t = *ts; + if (*t < '0' || *t > '9') { + return -1; + } else { + count = *t++ - '0'; + while (*t >= '0' && *t <= '9') { + count *= 10; + count += *t++ - '0'; + } + } + *ts = t; + return count; +} +static int __Pyx_BufFmt_ExpectNumber(const char **ts) { + int number = __Pyx_BufFmt_ParseNumber(ts); + if (number == -1) + PyErr_Format(PyExc_ValueError,\ + "Does not understand character buffer dtype format string ('%c')", **ts); + return number; +} +static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { + PyErr_Format(PyExc_ValueError, + "Unexpected format string character: '%c'", ch); +} +static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) { + switch (ch) { + case '?': return "'bool'"; + case 'c': return "'char'"; + case 'b': return "'signed char'"; + case 'B': return "'unsigned char'"; + case 'h': return "'short'"; + case 'H': return "'unsigned short'"; + case 'i': return "'int'"; + case 'I': return "'unsigned int'"; + case 'l': return "'long'"; + case 'L': return "'unsigned long'"; + case 'q': return "'long long'"; + case 'Q': return "'unsigned long long'"; + case 'f': 
return (is_complex ? "'complex float'" : "'float'"); + case 'd': return (is_complex ? "'complex double'" : "'double'"); + case 'g': return (is_complex ? "'complex long double'" : "'long double'"); + case 'T': return "a struct"; + case 'O': return "Python object"; + case 'P': return "a pointer"; + case 's': case 'p': return "a string"; + case 0: return "end"; + default: return "unparseable format string"; + } +} +static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return 2; + case 'i': case 'I': case 'l': case 'L': return 4; + case 'q': case 'Q': return 8; + case 'f': return (is_complex ? 8 : 4); + case 'd': return (is_complex ? 16 : 8); + case 'g': { + PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g').."); + return 0; + } + case 'O': case 'P': return sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(short); + case 'i': case 'I': return sizeof(int); + case 'l': case 'L': return sizeof(long); + #ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(PY_LONG_LONG); + #endif + case 'f': return sizeof(float) * (is_complex ? 2 : 1); + case 'd': return sizeof(double) * (is_complex ? 2 : 1); + case 'g': return sizeof(long double) * (is_complex ? 2 : 1); + case 'O': case 'P': return sizeof(void*); + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +typedef struct { char c; short x; } __Pyx_st_short; +typedef struct { char c; int x; } __Pyx_st_int; +typedef struct { char c; long x; } __Pyx_st_long; +typedef struct { char c; float x; } __Pyx_st_float; +typedef struct { char c; double x; } __Pyx_st_double; +typedef struct { char c; long double x; } __Pyx_st_longdouble; +typedef struct { char c; void *x; } __Pyx_st_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_st_float) - sizeof(float); + case 'd': return sizeof(__Pyx_st_double) - sizeof(double); + case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +/* These are for computing the padding at the end of the struct to align + on the first member of the struct. This will probably the same as above, + but we don't have any guarantees. 
+ */ +typedef struct { short x; char c; } __Pyx_pad_short; +typedef struct { int x; char c; } __Pyx_pad_int; +typedef struct { long x; char c; } __Pyx_pad_long; +typedef struct { float x; char c; } __Pyx_pad_float; +typedef struct { double x; char c; } __Pyx_pad_double; +typedef struct { long double x; char c; } __Pyx_pad_longdouble; +typedef struct { void *x; char c; } __Pyx_pad_void_p; +#ifdef HAVE_LONG_LONG +typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong; +#endif +static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { + switch (ch) { + case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1; + case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short); + case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int); + case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long); +#ifdef HAVE_LONG_LONG + case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); +#endif + case 'f': return sizeof(__Pyx_pad_float) - sizeof(float); + case 'd': return sizeof(__Pyx_pad_double) - sizeof(double); + case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double); + case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*); + default: + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } +} +static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { + switch (ch) { + case 'c': + return 'H'; + case 'b': case 'h': case 'i': + case 'l': case 'q': case 's': case 'p': + return 'I'; + case '?': case 'B': case 'H': case 'I': case 'L': case 'Q': + return 'U'; + case 'f': case 'd': case 'g': + return (is_complex ? 'C' : 'R'); + case 'O': + return 'O'; + case 'P': + return 'P'; + default: { + __Pyx_BufFmt_RaiseUnexpectedChar(ch); + return 0; + } + } +} +static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) { + if (ctx->head == NULL || ctx->head->field == &ctx->root) { + const char* expected; + const char* quote; + if (ctx->head == NULL) { + expected = "end"; + quote = ""; + } else { + expected = ctx->head->field->type->name; + quote = "'"; + } + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected %s%s%s but got %s", + quote, expected, quote, + __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex)); + } else { + __Pyx_StructField* field = ctx->head->field; + __Pyx_StructField* parent = (ctx->head - 1)->field; + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'", + field->type->name, __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex), + parent->type->name, field->name); + } +} +static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { + char group; + size_t size, offset, arraysize = 1; + if (ctx->enc_type == 0) return 0; + if (ctx->head->field->type->arraysize[0]) { + int i, ndim = 0; + if (ctx->enc_type == 's' || ctx->enc_type == 'p') { + ctx->is_valid_array = ctx->head->field->type->ndim == 1; + ndim = 1; + if (ctx->enc_count != ctx->head->field->type->arraysize[0]) { + PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %zu", + ctx->head->field->type->arraysize[0], ctx->enc_count); + return -1; + } + } + if (!ctx->is_valid_array) { + PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d", + ctx->head->field->type->ndim, ndim); + return -1; + } + for (i = 0; i < ctx->head->field->type->ndim; i++) { + arraysize *= ctx->head->field->type->arraysize[i]; + } + ctx->is_valid_array = 0; + ctx->enc_count = 1; + } + group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, 
ctx->is_complex); + do { + __Pyx_StructField* field = ctx->head->field; + __Pyx_TypeInfo* type = field->type; + if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { + size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex); + } else { + size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex); + } + if (ctx->enc_packmode == '@') { + size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex); + size_t align_mod_offset; + if (align_at == 0) return -1; + align_mod_offset = ctx->fmt_offset % align_at; + if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; + if (ctx->struct_alignment == 0) + ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type, + ctx->is_complex); + } + if (type->size != size || type->typegroup != group) { + if (type->typegroup == 'C' && type->fields != NULL) { + size_t parent_offset = ctx->head->parent_offset + field->offset; + ++ctx->head; + ctx->head->field = type->fields; + ctx->head->parent_offset = parent_offset; + continue; + } + if ((type->typegroup == 'H' || group == 'H') && type->size == size) { + } else { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + } + offset = ctx->head->parent_offset + field->offset; + if (ctx->fmt_offset != offset) { + PyErr_Format(PyExc_ValueError, + "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected", + (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset); + return -1; + } + ctx->fmt_offset += size; + if (arraysize) + ctx->fmt_offset += (arraysize - 1) * size; + --ctx->enc_count; + while (1) { + if (field == &ctx->root) { + ctx->head = NULL; + if (ctx->enc_count != 0) { + __Pyx_BufFmt_RaiseExpected(ctx); + return -1; + } + break; + } + ctx->head->field = ++field; + if (field->type == NULL) { + --ctx->head; + field = ctx->head->field; + continue; + } else if (field->type->typegroup == 'S') { + size_t parent_offset = ctx->head->parent_offset + field->offset; + if (field->type->fields->type == NULL) continue; + field = field->type->fields; + ++ctx->head; + ctx->head->field = field; + ctx->head->parent_offset = parent_offset; + break; + } else { + break; + } + } + } while (ctx->enc_count); + ctx->enc_type = 0; + ctx->is_complex = 0; + return 0; +} +static PyObject * +__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp) +{ + const char *ts = *tsp; + int i = 0, number, ndim; + ++ts; + if (ctx->new_count != 1) { + PyErr_SetString(PyExc_ValueError, + "Cannot handle repeated arrays in format string"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ndim = ctx->head->field->type->ndim; + while (*ts && *ts != ')') { + switch (*ts) { + case ' ': case '\f': case '\r': case '\n': case '\t': case '\v': continue; + default: break; + } + number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i]) + return PyErr_Format(PyExc_ValueError, + "Expected a dimension of size %zu, got %d", + ctx->head->field->type->arraysize[i], number); + if (*ts != ',' && *ts != ')') + return PyErr_Format(PyExc_ValueError, + "Expected a comma in format string, got '%c'", *ts); + if (*ts == ',') ts++; + i++; + } + if (i != ndim) + return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d", + ctx->head->field->type->ndim, i); + if (!*ts) { + PyErr_SetString(PyExc_ValueError, + "Unexpected end of format string, expected ')'"); + return NULL; + } + 
ctx->is_valid_array = 1; + ctx->new_count = 1; + *tsp = ++ts; + return Py_None; +} +static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) { + int got_Z = 0; + while (1) { + switch(*ts) { + case 0: + if (ctx->enc_type != 0 && ctx->head == NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + if (ctx->head != NULL) { + __Pyx_BufFmt_RaiseExpected(ctx); + return NULL; + } + return ts; + case ' ': + case '\r': + case '\n': + ++ts; + break; + case '<': + if (!__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '>': + case '!': + if (__Pyx_Is_Little_Endian()) { + PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler"); + return NULL; + } + ctx->new_packmode = '='; + ++ts; + break; + case '=': + case '@': + case '^': + ctx->new_packmode = *ts++; + break; + case 'T': + { + const char* ts_after_sub; + size_t i, struct_count = ctx->new_count; + size_t struct_alignment = ctx->struct_alignment; + ctx->new_count = 1; + ++ts; + if (*ts != '{') { + PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'"); + return NULL; + } + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + ctx->enc_count = 0; + ctx->struct_alignment = 0; + ++ts; + ts_after_sub = ts; + for (i = 0; i != struct_count; ++i) { + ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts); + if (!ts_after_sub) return NULL; + } + ts = ts_after_sub; + if (struct_alignment) ctx->struct_alignment = struct_alignment; + } + break; + case '}': + { + size_t alignment = ctx->struct_alignment; + ++ts; + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_type = 0; + if (alignment && ctx->fmt_offset % alignment) { + ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment); + } + } + return ts; + case 'x': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->fmt_offset += ctx->new_count; + ctx->new_count = 1; + ctx->enc_count = 0; + ctx->enc_type = 0; + ctx->enc_packmode = ctx->new_packmode; + ++ts; + break; + case 'Z': + got_Z = 1; + ++ts; + if (*ts != 'f' && *ts != 'd' && *ts != 'g') { + __Pyx_BufFmt_RaiseUnexpectedChar('Z'); + return NULL; + } + CYTHON_FALLTHROUGH; + case '?': case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I': + case 'l': case 'L': case 'q': case 'Q': + case 'f': case 'd': case 'g': + case 'O': case 'p': + if ((ctx->enc_type == *ts) && (got_Z == ctx->is_complex) && + (ctx->enc_packmode == ctx->new_packmode) && (!ctx->is_valid_array)) { + ctx->enc_count += ctx->new_count; + ctx->new_count = 1; + got_Z = 0; + ++ts; + break; + } + CYTHON_FALLTHROUGH; + case 's': + if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL; + ctx->enc_count = ctx->new_count; + ctx->enc_packmode = ctx->new_packmode; + ctx->enc_type = *ts; + ctx->is_complex = got_Z; + ++ts; + ctx->new_count = 1; + got_Z = 0; + break; + case ':': + ++ts; + while(*ts != ':') ++ts; + ++ts; + break; + case '(': + if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL; + break; + default: + { + int number = __Pyx_BufFmt_ExpectNumber(&ts); + if (number == -1) return NULL; + ctx->new_count = (size_t)number; + } + } + } +} + +/* BufferGetAndValidate */ + static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) { + if (unlikely(info->buf == NULL)) return; + if (info->suboffsets == 
__Pyx_minusones) info->suboffsets = NULL; + __Pyx_ReleaseBuffer(info); +} +static void __Pyx_ZeroBuffer(Py_buffer* buf) { + buf->buf = NULL; + buf->obj = NULL; + buf->strides = __Pyx_zeros; + buf->shape = __Pyx_zeros; + buf->suboffsets = __Pyx_minusones; +} +static int __Pyx__GetBufferAndValidate( + Py_buffer* buf, PyObject* obj, __Pyx_TypeInfo* dtype, int flags, + int nd, int cast, __Pyx_BufFmt_StackElem* stack) +{ + buf->buf = NULL; + if (unlikely(__Pyx_GetBuffer(obj, buf, flags) == -1)) { + __Pyx_ZeroBuffer(buf); + return -1; + } + if (unlikely(buf->ndim != nd)) { + PyErr_Format(PyExc_ValueError, + "Buffer has wrong number of dimensions (expected %d, got %d)", + nd, buf->ndim); + goto fail; + } + if (!cast) { + __Pyx_BufFmt_Context ctx; + __Pyx_BufFmt_Init(&ctx, stack, dtype); + if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail; + } + if (unlikely((size_t)buf->itemsize != dtype->size)) { + PyErr_Format(PyExc_ValueError, + "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)", + buf->itemsize, (buf->itemsize > 1) ? "s" : "", + dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : ""); + goto fail; + } + if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones; + return 0; +fail:; + __Pyx_SafeReleaseBuffer(buf); + return -1; +} + +/* PyDictVersioning */ + #if CYTHON_USE_DICT_VERSIONS && CYTHON_USE_TYPE_SLOTS +static CYTHON_INLINE PY_UINT64_T __Pyx_get_tp_dict_version(PyObject *obj) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + return likely(dict) ? __PYX_GET_DICT_VERSION(dict) : 0; +} +static CYTHON_INLINE PY_UINT64_T __Pyx_get_object_dict_version(PyObject *obj) { + PyObject **dictptr = NULL; + Py_ssize_t offset = Py_TYPE(obj)->tp_dictoffset; + if (offset) { +#if CYTHON_COMPILING_IN_CPYTHON + dictptr = (likely(offset > 0)) ? (PyObject **) ((char *)obj + offset) : _PyObject_GetDictPtr(obj); +#else + dictptr = _PyObject_GetDictPtr(obj); +#endif + } + return (dictptr && *dictptr) ? 
__PYX_GET_DICT_VERSION(*dictptr) : 0; +} +static CYTHON_INLINE int __Pyx_object_dict_version_matches(PyObject* obj, PY_UINT64_T tp_dict_version, PY_UINT64_T obj_dict_version) { + PyObject *dict = Py_TYPE(obj)->tp_dict; + if (unlikely(!dict) || unlikely(tp_dict_version != __PYX_GET_DICT_VERSION(dict))) + return 0; + return obj_dict_version == __Pyx_get_object_dict_version(obj); +} +#endif + +/* GetModuleGlobalName */ + #if CYTHON_USE_DICT_VERSIONS +static PyObject *__Pyx__GetModuleGlobalName(PyObject *name, PY_UINT64_T *dict_version, PyObject **dict_cached_value) +#else +static CYTHON_INLINE PyObject *__Pyx__GetModuleGlobalName(PyObject *name) +#endif +{ + PyObject *result; +#if !CYTHON_AVOID_BORROWED_REFS +#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 + result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } else if (unlikely(PyErr_Occurred())) { + return NULL; + } +#else + result = PyDict_GetItem(__pyx_d, name); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } +#endif +#else + result = PyObject_GetItem(__pyx_d, name); + __PYX_UPDATE_DICT_CACHE(__pyx_d, result, *dict_cached_value, *dict_version) + if (likely(result)) { + return __Pyx_NewRef(result); + } + PyErr_Clear(); +#endif + return __Pyx_GetBuiltinName(name); +} + +/* PyObjectCall */ + #if CYTHON_COMPILING_IN_CPYTHON +static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) { + PyObject *result; + ternaryfunc call = Py_TYPE(func)->tp_call; + if (unlikely(!call)) + return PyObject_Call(func, arg, kw); + if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) + return NULL; + result = (*call)(func, arg, kw); + Py_LeaveRecursiveCall(); + if (unlikely(!result) && unlikely(!PyErr_Occurred())) { + PyErr_SetString( + PyExc_SystemError, + "NULL result without error in PyObject_Call"); + } + return result; +} +#endif + +/* ExtTypeTest */ + static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) { + if (unlikely(!type)) { + PyErr_SetString(PyExc_SystemError, "Missing type object"); + return 0; + } + if (likely(__Pyx_TypeCheck(obj, type))) + return 1; + PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s", + Py_TYPE(obj)->tp_name, type->tp_name); + return 0; +} + +/* BufferIndexError */ + static void __Pyx_RaiseBufferIndexError(int axis) { + PyErr_Format(PyExc_IndexError, + "Out of bounds on buffer access (axis %d)", axis); +} + +/* PyErrFetchRestore */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + tmp_type = tstate->curexc_type; + tmp_value = tstate->curexc_value; + tmp_tb = tstate->curexc_traceback; + tstate->curexc_type = type; + tstate->curexc_value = value; + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + *type = tstate->curexc_type; + *value = tstate->curexc_value; + *tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +} +#endif + +/* GetTopmostException */ + #if CYTHON_USE_EXC_INFO_STACK 
+static _PyErr_StackItem * +__Pyx_PyErr_GetTopmostException(PyThreadState *tstate) +{ + _PyErr_StackItem *exc_info = tstate->exc_info; + while ((exc_info->exc_type == NULL || exc_info->exc_type == Py_None) && + exc_info->previous_item != NULL) + { + exc_info = exc_info->previous_item; + } + return exc_info; +} +#endif + +/* SaveResetException */ + #if CYTHON_FAST_THREAD_STATE +static CYTHON_INLINE void __Pyx__ExceptionSave(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) { + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = __Pyx_PyErr_GetTopmostException(tstate); + *type = exc_info->exc_type; + *value = exc_info->exc_value; + *tb = exc_info->exc_traceback; + #else + *type = tstate->exc_type; + *value = tstate->exc_value; + *tb = tstate->exc_traceback; + #endif + Py_XINCREF(*type); + Py_XINCREF(*value); + Py_XINCREF(*tb); +} +static CYTHON_INLINE void __Pyx__ExceptionReset(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) { + PyObject *tmp_type, *tmp_value, *tmp_tb; + #if CYTHON_USE_EXC_INFO_STACK + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = type; + exc_info->exc_value = value; + exc_info->exc_traceback = tb; + #else + tmp_type = tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = type; + tstate->exc_value = value; + tstate->exc_traceback = tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +} +#endif + +/* PyErrExceptionMatches */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx_PyErr_ExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; i<n; i++) { + if (exc_type == PyTuple_GET_ITEM(tuple, i)) return 1; + } +#endif + for (i=0; i<n; i++) { + if (__Pyx_PyErr_GivenExceptionMatches(exc_type, PyTuple_GET_ITEM(tuple, i))) return 1; + } + return 0; +} +static CYTHON_INLINE int __Pyx_PyErr_ExceptionMatchesInState(PyThreadState* tstate, PyObject* err) { + PyObject *exc_type = tstate->curexc_type; + if (exc_type == err) return 1; + if (unlikely(!exc_type)) return 0; + if (unlikely(PyTuple_Check(err))) + return __Pyx_PyErr_ExceptionMatchesTuple(exc_type, err); + return __Pyx_PyErr_GivenExceptionMatches(exc_type, err); +} +#endif + +/* GetException */ + #if CYTHON_FAST_THREAD_STATE +static int __Pyx__GetException(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) +#else +static int __Pyx_GetException(PyObject **type, PyObject **value, PyObject **tb) +#endif +{ + PyObject *local_type, *local_value, *local_tb; +#if CYTHON_FAST_THREAD_STATE + PyObject *tmp_type, *tmp_value, *tmp_tb; + local_type = tstate->curexc_type; + local_value = tstate->curexc_value; + local_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; +#else + PyErr_Fetch(&local_type, &local_value, &local_tb); +#endif + PyErr_NormalizeException(&local_type, &local_value, &local_tb); +#if CYTHON_FAST_THREAD_STATE + if (unlikely(tstate->curexc_type)) +#else + if (unlikely(PyErr_Occurred())) +#endif + goto bad; + #if PY_MAJOR_VERSION >= 3 + if (local_tb) { + if (unlikely(PyException_SetTraceback(local_value, local_tb) < 0)) + goto bad; + } + #endif + Py_XINCREF(local_tb); + Py_XINCREF(local_type); + Py_XINCREF(local_value); + *type = local_type; + *value = local_value; + *tb = local_tb; +#if CYTHON_FAST_THREAD_STATE + #if CYTHON_USE_EXC_INFO_STACK + { + _PyErr_StackItem *exc_info = tstate->exc_info; + tmp_type = exc_info->exc_type; + tmp_value = exc_info->exc_value; + tmp_tb = exc_info->exc_traceback; + exc_info->exc_type = local_type; + exc_info->exc_value = local_value; + exc_info->exc_traceback = local_tb; + } + #else + tmp_type 
= tstate->exc_type; + tmp_value = tstate->exc_value; + tmp_tb = tstate->exc_traceback; + tstate->exc_type = local_type; + tstate->exc_value = local_value; + tstate->exc_traceback = local_tb; + #endif + Py_XDECREF(tmp_type); + Py_XDECREF(tmp_value); + Py_XDECREF(tmp_tb); +#else + PyErr_SetExcInfo(local_type, local_value, local_tb); +#endif + return 0; +bad: + *type = 0; + *value = 0; + *tb = 0; + Py_XDECREF(local_type); + Py_XDECREF(local_value); + Py_XDECREF(local_tb); + return -1; +} + +/* RaiseException */ + #if PY_MAJOR_VERSION < 3 +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, + CYTHON_UNUSED PyObject *cause) { + __Pyx_PyThreadState_declare + Py_XINCREF(type); + if (!value || value == Py_None) + value = NULL; + else + Py_INCREF(value); + if (!tb || tb == Py_None) + tb = NULL; + else { + Py_INCREF(tb); + if (!PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto raise_error; + } + } + if (PyType_Check(type)) { +#if CYTHON_COMPILING_IN_PYPY + if (!value) { + Py_INCREF(Py_None); + value = Py_None; + } +#endif + PyErr_NormalizeException(&type, &value, &tb); + } else { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto raise_error; + } + value = type; + type = (PyObject*) Py_TYPE(type); + Py_INCREF(type); + if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto raise_error; + } + } + __Pyx_PyThreadState_assign + __Pyx_ErrRestore(type, value, tb); + return; +raise_error: + Py_XDECREF(value); + Py_XDECREF(type); + Py_XDECREF(tb); + return; +} +#else +static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) { + PyObject* owned_instance = NULL; + if (tb == Py_None) { + tb = 0; + } else if (tb && !PyTraceBack_Check(tb)) { + PyErr_SetString(PyExc_TypeError, + "raise: arg 3 must be a traceback or None"); + goto bad; + } + if (value == Py_None) + value = 0; + if (PyExceptionInstance_Check(type)) { + if (value) { + PyErr_SetString(PyExc_TypeError, + "instance exception may not have a separate value"); + goto bad; + } + value = type; + type = (PyObject*) Py_TYPE(value); + } else if (PyExceptionClass_Check(type)) { + PyObject *instance_class = NULL; + if (value && PyExceptionInstance_Check(value)) { + instance_class = (PyObject*) Py_TYPE(value); + if (instance_class != type) { + int is_subclass = PyObject_IsSubclass(instance_class, type); + if (!is_subclass) { + instance_class = NULL; + } else if (unlikely(is_subclass == -1)) { + goto bad; + } else { + type = instance_class; + } + } + } + if (!instance_class) { + PyObject *args; + if (!value) + args = PyTuple_New(0); + else if (PyTuple_Check(value)) { + Py_INCREF(value); + args = value; + } else + args = PyTuple_Pack(1, value); + if (!args) + goto bad; + owned_instance = PyObject_Call(type, args, NULL); + Py_DECREF(args); + if (!owned_instance) + goto bad; + value = owned_instance; + if (!PyExceptionInstance_Check(value)) { + PyErr_Format(PyExc_TypeError, + "calling %R should have returned an instance of " + "BaseException, not %R", + type, Py_TYPE(value)); + goto bad; + } + } + } else { + PyErr_SetString(PyExc_TypeError, + "raise: exception class must be a subclass of BaseException"); + goto bad; + } + if (cause) { + PyObject *fixed_cause; + if (cause == Py_None) { + fixed_cause = NULL; + } else if (PyExceptionClass_Check(cause)) { + 
fixed_cause = PyObject_CallObject(cause, NULL); + if (fixed_cause == NULL) + goto bad; + } else if (PyExceptionInstance_Check(cause)) { + fixed_cause = cause; + Py_INCREF(fixed_cause); + } else { + PyErr_SetString(PyExc_TypeError, + "exception causes must derive from " + "BaseException"); + goto bad; + } + PyException_SetCause(value, fixed_cause); + } + PyErr_SetObject(type, value); + if (tb) { +#if CYTHON_COMPILING_IN_PYPY + PyObject *tmp_type, *tmp_value, *tmp_tb; + PyErr_Fetch(&tmp_type, &tmp_value, &tmp_tb); + Py_INCREF(tb); + PyErr_Restore(tmp_type, tmp_value, tb); + Py_XDECREF(tmp_tb); +#else + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject* tmp_tb = tstate->curexc_traceback; + if (tb != tmp_tb) { + Py_INCREF(tb); + tstate->curexc_traceback = tb; + Py_XDECREF(tmp_tb); + } +#endif + } +bad: + Py_XDECREF(owned_instance); + return; +} +#endif + +/* TypeImport */ + #ifndef __PYX_HAVE_RT_ImportType +#define __PYX_HAVE_RT_ImportType +static PyTypeObject *__Pyx_ImportType(PyObject *module, const char *module_name, const char *class_name, + size_t size, enum __Pyx_ImportType_CheckSize check_size) +{ + PyObject *result = 0; + char warning[200]; + Py_ssize_t basicsize; +#ifdef Py_LIMITED_API + PyObject *py_basicsize; +#endif + result = PyObject_GetAttrString(module, class_name); + if (!result) + goto bad; + if (!PyType_Check(result)) { + PyErr_Format(PyExc_TypeError, + "%.200s.%.200s is not a type object", + module_name, class_name); + goto bad; + } +#ifndef Py_LIMITED_API + basicsize = ((PyTypeObject *)result)->tp_basicsize; +#else + py_basicsize = PyObject_GetAttrString(result, "__basicsize__"); + if (!py_basicsize) + goto bad; + basicsize = PyLong_AsSsize_t(py_basicsize); + Py_DECREF(py_basicsize); + py_basicsize = 0; + if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred()) + goto bad; +#endif + if ((size_t)basicsize < size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s size changed, may indicate binary incompatibility. " + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + goto bad; + } + if (check_size == __Pyx_ImportType_CheckSize_Error && (size_t)basicsize != size) { + PyErr_Format(PyExc_ValueError, + "%.200s.%.200s size changed, may indicate binary incompatibility. " + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + goto bad; + } + else if (check_size == __Pyx_ImportType_CheckSize_Warn && (size_t)basicsize > size) { + PyOS_snprintf(warning, sizeof(warning), + "%s.%s size changed, may indicate binary incompatibility. 
" + "Expected %zd from C header, got %zd from PyObject", + module_name, class_name, size, basicsize); + if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad; + } + return (PyTypeObject *)result; +bad: + Py_XDECREF(result); + return NULL; +} +#endif + +/* Import */ + static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_MAJOR_VERSION < 3 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if ((1) && (strchr(__Pyx_MODULE_NAME, '.'))) { + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; + } + #endif + if (!module) { + #if PY_MAJOR_VERSION < 3 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, (PyObject *)NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } +bad: + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + +/* CLineInTraceback */ + #ifndef CYTHON_CLINE_IN_TRACEBACK +static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) { + PyObject *use_cline; + PyObject *ptype, *pvalue, *ptraceback; +#if CYTHON_COMPILING_IN_CPYTHON + PyObject **cython_runtime_dict; +#endif + if (unlikely(!__pyx_cython_runtime)) { + return c_line; + } + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); +#if CYTHON_COMPILING_IN_CPYTHON + cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime); + if (likely(cython_runtime_dict)) { + __PYX_PY_DICT_LOOKUP_IF_MODIFIED( + use_cline, *cython_runtime_dict, + __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback)) + } else +#endif + { + PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback); + if (use_cline_obj) { + use_cline = PyObject_Not(use_cline_obj) ? 
Py_False : Py_True; + Py_DECREF(use_cline_obj); + } else { + PyErr_Clear(); + use_cline = NULL; + } + } + if (!use_cline) { + c_line = 0; + (void) PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False); + } + else if (use_cline == Py_False || (use_cline != Py_True && PyObject_Not(use_cline) != 0)) { + c_line = 0; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + return c_line; +} +#endif + +/* CodeObjectCache */ + static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) { + int start = 0, mid = 0, end = count - 1; + if (end >= 0 && code_line > entries[end].code_line) { + return count; + } + while (start < end) { + mid = start + (end - start) / 2; + if (code_line < entries[mid].code_line) { + end = mid; + } else if (code_line > entries[mid].code_line) { + start = mid + 1; + } else { + return mid; + } + } + if (code_line <= entries[mid].code_line) { + return mid; + } else { + return mid + 1; + } +} +static PyCodeObject *__pyx_find_code_object(int code_line) { + PyCodeObject* code_object; + int pos; + if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) { + return NULL; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) { + return NULL; + } + code_object = __pyx_code_cache.entries[pos].code_object; + Py_INCREF(code_object); + return code_object; +} +static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) { + int pos, i; + __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries; + if (unlikely(!code_line)) { + return; + } + if (unlikely(!entries)) { + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry)); + if (likely(entries)) { + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = 64; + __pyx_code_cache.count = 1; + entries[0].code_line = code_line; + entries[0].code_object = code_object; + Py_INCREF(code_object); + } + return; + } + pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line); + if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) { + PyCodeObject* tmp = entries[pos].code_object; + entries[pos].code_object = code_object; + Py_DECREF(tmp); + return; + } + if (__pyx_code_cache.count == __pyx_code_cache.max_count) { + int new_max = __pyx_code_cache.max_count + 64; + entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc( + __pyx_code_cache.entries, ((size_t)new_max) * sizeof(__Pyx_CodeObjectCacheEntry)); + if (unlikely(!entries)) { + return; + } + __pyx_code_cache.entries = entries; + __pyx_code_cache.max_count = new_max; + } + for (i=__pyx_code_cache.count; i>pos; i--) { + entries[i] = entries[i-1]; + } + entries[pos].code_line = code_line; + entries[pos].code_object = code_object; + __pyx_code_cache.count++; + Py_INCREF(code_object); +} + +/* AddTraceback */ + #include "compile.h" +#include "frameobject.h" +#include "traceback.h" +#if PY_VERSION_HEX >= 0x030b00a6 + #ifndef Py_BUILD_CORE + #define Py_BUILD_CORE 1 + #endif + #include "internal/pycore_frame.h" +#endif +static PyCodeObject* __Pyx_CreateCodeObjectForTraceback( + const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = NULL; + PyObject *py_funcname = NULL; + #if PY_MAJOR_VERSION < 3 + PyObject *py_srcfile = NULL; + py_srcfile = PyString_FromString(filename); + 
if (!py_srcfile) goto bad; + #endif + if (c_line) { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + if (!py_funcname) goto bad; + #else + py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line); + if (!py_funcname) goto bad; + funcname = PyUnicode_AsUTF8(py_funcname); + if (!funcname) goto bad; + #endif + } + else { + #if PY_MAJOR_VERSION < 3 + py_funcname = PyString_FromString(funcname); + if (!py_funcname) goto bad; + #endif + } + #if PY_MAJOR_VERSION < 3 + py_code = __Pyx_PyCode_New( + 0, + 0, + 0, + 0, + 0, + __pyx_empty_bytes, /*PyObject *code,*/ + __pyx_empty_tuple, /*PyObject *consts,*/ + __pyx_empty_tuple, /*PyObject *names,*/ + __pyx_empty_tuple, /*PyObject *varnames,*/ + __pyx_empty_tuple, /*PyObject *freevars,*/ + __pyx_empty_tuple, /*PyObject *cellvars,*/ + py_srcfile, /*PyObject *filename,*/ + py_funcname, /*PyObject *name,*/ + py_line, + __pyx_empty_bytes /*PyObject *lnotab*/ + ); + Py_DECREF(py_srcfile); + #else + py_code = PyCode_NewEmpty(filename, funcname, py_line); + #endif + Py_XDECREF(py_funcname); // XDECREF since it's only set on Py3 if cline + return py_code; +bad: + Py_XDECREF(py_funcname); + #if PY_MAJOR_VERSION < 3 + Py_XDECREF(py_srcfile); + #endif + return NULL; +} +static void __Pyx_AddTraceback(const char *funcname, int c_line, + int py_line, const char *filename) { + PyCodeObject *py_code = 0; + PyFrameObject *py_frame = 0; + PyThreadState *tstate = __Pyx_PyThreadState_Current; + PyObject *ptype, *pvalue, *ptraceback; + if (c_line) { + c_line = __Pyx_CLineForTraceback(tstate, c_line); + } + py_code = __pyx_find_code_object(c_line ? -c_line : py_line); + if (!py_code) { + __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback); + py_code = __Pyx_CreateCodeObjectForTraceback( + funcname, c_line, py_line, filename); + if (!py_code) { + /* If the code object creation fails, then we should clear the + fetched exception references and propagate the new exception */ + Py_XDECREF(ptype); + Py_XDECREF(pvalue); + Py_XDECREF(ptraceback); + goto bad; + } + __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback); + __pyx_insert_code_object(c_line ? 
-c_line : py_line, py_code); + } + py_frame = PyFrame_New( + tstate, /*PyThreadState *tstate,*/ + py_code, /*PyCodeObject *code,*/ + __pyx_d, /*PyObject *globals,*/ + 0 /*PyObject *locals*/ + ); + if (!py_frame) goto bad; + __Pyx_PyFrame_SetLineNumber(py_frame, py_line); + PyTraceBack_Here(py_frame); +bad: + Py_XDECREF(py_code); + Py_XDECREF(py_frame); +} + +#if PY_MAJOR_VERSION < 3 +static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { + if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags); + PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name); + return -1; +} +static void __Pyx_ReleaseBuffer(Py_buffer *view) { + PyObject *obj = view->obj; + if (!obj) return; + if (PyObject_CheckBuffer(obj)) { + PyBuffer_Release(view); + return; + } + if ((0)) {} + view->obj = NULL; + Py_DECREF(obj); +} +#endif + + + /* CIntFromPyVerify */ + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0) +#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\ + __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1) +#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\ + {\ + func_type value = func_value;\ + if (sizeof(target_type) < sizeof(func_type)) {\ + if (unlikely(value != (func_type) (target_type) value)) {\ + func_type zero = 0;\ + if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\ + return (target_type) -1;\ + if (is_unsigned && unlikely(value < zero))\ + goto raise_neg_overflow;\ + else\ + goto raise_overflow;\ + }\ + }\ + return (target_type) value;\ + } + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return ::std::complex< float >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + return x + y*(__pyx_t_float_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { + __pyx_t_float_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sum_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_diff_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prod_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabsf(b.real) >= fabsf(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + float r = b.imag / 
b.real; + float s = (float)(1.0) / (b.real + b.imag * r); + return __pyx_t_float_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + float r = b.real / b.imag; + float s = (float)(1.0) / (b.imag + b.real * r); + return __pyx_t_float_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quot_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + if (b.imag == 0) { + return __pyx_t_float_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + float denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_float_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_neg_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_float(__pyx_t_float_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conj_float(__pyx_t_float_complex a) { + __pyx_t_float_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE float __Pyx_c_abs_float(__pyx_t_float_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrtf(z.real*z.real + z.imag*z.imag); + #else + return hypotf(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_pow_float(__pyx_t_float_complex a, __pyx_t_float_complex b) { + __pyx_t_float_complex z; + float r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + float denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + return __Pyx_c_prod_float(a, a); + case 3: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, a); + case 4: + z = __Pyx_c_prod_float(a, a); + return __Pyx_c_prod_float(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if ((b.imag == 0) && (a.real >= 0)) { + z.real = powf(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2f(0.0, -1.0); + } + } else { + r = __Pyx_c_abs_float(a); + theta = atan2f(a.imag, a.real); + } + lnr = logf(r); + z_r = expf(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cosf(z_theta); + z.imag = z_r * sinf(z_theta); + return z; + } + #endif +#endif + +/* Declarations */ + #if CYTHON_CCOMPLEX + #ifdef __cplusplus + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return ::std::complex< double >(x, y); + } + #else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + return x + y*(__pyx_t_double_complex)_Complex_I; + } + #endif +#else + static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { + __pyx_t_double_complex z; + z.real = x; + z.imag = y; + return z; + } +#endif + +/* Arithmetic */ + #if CYTHON_CCOMPLEX +#else + static CYTHON_INLINE int __Pyx_c_eq_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + return (a.real == b.real) && (a.imag == b.imag); + } + static CYTHON_INLINE __pyx_t_double_complex 
__Pyx_c_sum_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real + b.real; + z.imag = a.imag + b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real - b.real; + z.imag = a.imag - b.imag; + return z; + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + z.real = a.real * b.real - a.imag * b.imag; + z.imag = a.real * b.imag + a.imag * b.real; + return z; + } + #if 1 + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else if (fabs(b.real) >= fabs(b.imag)) { + if (b.real == 0 && b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.imag); + } else { + double r = b.imag / b.real; + double s = (double)(1.0) / (b.real + b.imag * r); + return __pyx_t_double_complex_from_parts( + (a.real + a.imag * r) * s, (a.imag - a.real * r) * s); + } + } else { + double r = b.real / b.imag; + double s = (double)(1.0) / (b.imag + b.real * r); + return __pyx_t_double_complex_from_parts( + (a.real * r + a.imag) * s, (a.imag * r - a.real) * s); + } + } + #else + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + if (b.imag == 0) { + return __pyx_t_double_complex_from_parts(a.real / b.real, a.imag / b.real); + } else { + double denom = b.real * b.real + b.imag * b.imag; + return __pyx_t_double_complex_from_parts( + (a.real * b.real + a.imag * b.imag) / denom, + (a.imag * b.real - a.real * b.imag) / denom); + } + } + #endif + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = -a.real; + z.imag = -a.imag; + return z; + } + static CYTHON_INLINE int __Pyx_c_is_zero_double(__pyx_t_double_complex a) { + return (a.real == 0) && (a.imag == 0); + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj_double(__pyx_t_double_complex a) { + __pyx_t_double_complex z; + z.real = a.real; + z.imag = -a.imag; + return z; + } + #if 1 + static CYTHON_INLINE double __Pyx_c_abs_double(__pyx_t_double_complex z) { + #if !defined(HAVE_HYPOT) || defined(_MSC_VER) + return sqrt(z.real*z.real + z.imag*z.imag); + #else + return hypot(z.real, z.imag); + #endif + } + static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow_double(__pyx_t_double_complex a, __pyx_t_double_complex b) { + __pyx_t_double_complex z; + double r, lnr, theta, z_r, z_theta; + if (b.imag == 0 && b.real == (int)b.real) { + if (b.real < 0) { + double denom = a.real * a.real + a.imag * a.imag; + a.real = a.real / denom; + a.imag = -a.imag / denom; + b.real = -b.real; + } + switch ((int)b.real) { + case 0: + z.real = 1; + z.imag = 0; + return z; + case 1: + return a; + case 2: + return __Pyx_c_prod_double(a, a); + case 3: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, a); + case 4: + z = __Pyx_c_prod_double(a, a); + return __Pyx_c_prod_double(z, z); + } + } + if (a.imag == 0) { + if (a.real == 0) { + return a; + } else if ((b.imag == 0) && (a.real >= 0)) { + z.real = pow(a.real, b.real); + z.imag = 0; + return z; + } else if (a.real > 0) { + r = a.real; + theta = 0; + } else { + r = -a.real; + theta = atan2(0.0, -1.0); + } + 
} else { + r = __Pyx_c_abs_double(a); + theta = atan2(a.imag, a.real); + } + lnr = log(r); + z_r = exp(lnr * b.real - theta * b.imag); + z_theta = theta * b.real + lnr * b.imag; + z.real = z_r * cos(z_theta); + z.imag = z_r * sin(z_theta); + return z; + } + #endif +#endif + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_unsigned_int(unsigned int value) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const unsigned int neg_one = (unsigned int) -1, const_zero = (unsigned int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(unsigned int) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(unsigned int) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(unsigned int) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(unsigned int), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE unsigned int __Pyx_PyInt_As_unsigned_int(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const unsigned int neg_one = (unsigned int) -1, const_zero = (unsigned int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(unsigned int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(unsigned int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (unsigned int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (unsigned int) 0; + case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, digits[0]) + case 2: + if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 2 * PyLong_SHIFT) { + return (unsigned int) (((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 3 * PyLong_SHIFT) { + return (unsigned int) (((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(unsigned int) > 3 * 
PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) >= 4 * PyLong_SHIFT) { + return (unsigned int) (((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (unsigned int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(unsigned int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (unsigned int) 0; + case -1: __PYX_VERIFY_RETURN_INT(unsigned int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(unsigned int, digit, +digits[0]) + case -2: + if (8 * sizeof(unsigned int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(unsigned int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + return (unsigned int) ((((((unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(unsigned int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(unsigned int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + return (unsigned int) ((((((((unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(unsigned int) - 1 > 3 * PyLong_SHIFT) { + if 
(8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { + return (unsigned int) (((unsigned int)-1)*(((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(unsigned int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(unsigned int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(unsigned int) - 1 > 4 * PyLong_SHIFT) { + return (unsigned int) ((((((((((unsigned int)digits[3]) << PyLong_SHIFT) | (unsigned int)digits[2]) << PyLong_SHIFT) | (unsigned int)digits[1]) << PyLong_SHIFT) | (unsigned int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(unsigned int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(unsigned int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(unsigned int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + unsigned int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (unsigned int) -1; + } + } else { + unsigned int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (unsigned int) -1; + val = __Pyx_PyInt_As_unsigned_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to unsigned int"); + return (unsigned int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to unsigned int"); + return (unsigned int) -1; +} + +/* CIntToPy */ + static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; + if (is_unsigned) { + if (sizeof(long) < sizeof(long)) { + return PyInt_FromLong((long) value); + } else if (sizeof(long) <= sizeof(unsigned long)) { + return PyLong_FromUnsignedLong((unsigned long) value); +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value); +#endif + } + } else { + if (sizeof(long) <= sizeof(long)) { + return PyInt_FromLong((long) value); +#ifdef HAVE_LONG_LONG + } else 
if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + return PyLong_FromLongLong((PY_LONG_LONG) value); +#endif + } + } + { + int one = 1; int little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&value; + return _PyLong_FromByteArray(bytes, sizeof(long), + little, !is_unsigned); + } +} + +/* CIntFromPy */ + static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const long neg_one = (long) -1, const_zero = (long) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(long) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (long) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case 1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0]) + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) { + return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) { + return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) { + return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (long) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(long) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (long) 0; + case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(long, digit, +digits[0]) + case -2: + if (8 * 
sizeof(long) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(long) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(long) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(long) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) { + return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]))); + } + } + break; + } +#endif + if (sizeof(long) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers"); +#else + long val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = 
(int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (long) -1; + } + } else { + long val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (long) -1; + val = __Pyx_PyInt_As_long(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to long"); + return (long) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to long"); + return (long) -1; +} + +/* CIntFromPy */ + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif + const int neg_one = (int) -1, const_zero = (int) 0; +#ifdef __Pyx_HAS_GCC_DIAGNOSTIC +#pragma GCC diagnostic pop +#endif + const int is_unsigned = neg_one > const_zero; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x))) { + if (sizeof(int) < sizeof(long)) { + __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) + } else { + long val = PyInt_AS_LONG(x); + if (is_unsigned && unlikely(val < 0)) { + goto raise_neg_overflow; + } + return (int) val; + } + } else +#endif + if (likely(PyLong_Check(x))) { + if (is_unsigned) { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case 1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0]) + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) { + return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) { + return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) { + return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])); + } + } + break; + } +#endif +#if CYTHON_COMPILING_IN_CPYTHON + if (unlikely(Py_SIZE(x) < 0)) { + goto raise_neg_overflow; + } +#else + { + int result = PyObject_RichCompareBool(x, Py_False, Py_LT); + if (unlikely(result < 0)) + return (int) -1; + if (unlikely(result == 1)) + goto raise_neg_overflow; + } +#endif + if (sizeof(int) <= sizeof(unsigned long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, 
PyLong_AsUnsignedLongLong(x)) +#endif + } + } else { +#if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)x)->ob_digit; + switch (Py_SIZE(x)) { + case 0: return (int) 0; + case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0])) + case 1: __PYX_VERIFY_RETURN_INT(int, digit, +digits[0]) + case -2: + if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 2: + if (8 * sizeof(int) > 1 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -3: + if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 3: + if (8 * sizeof(int) > 2 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case -4: + if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + case 4: + if (8 * sizeof(int) > 3 * PyLong_SHIFT) { + if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) { + __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0]))) + } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) { + return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]))); + } + } + break; + } +#endif + if (sizeof(int) <= sizeof(long)) { + __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x)) +#ifdef HAVE_LONG_LONG + } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) { + __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x)) +#endif + } + } + { +#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray) + PyErr_SetString(PyExc_RuntimeError, + "_PyLong_AsByteArray() not available in 
PyPy, cannot convert large numbers"); +#else + int val; + PyObject *v = __Pyx_PyNumber_IntOrLong(x); + #if PY_MAJOR_VERSION < 3 + if (likely(v) && !PyLong_Check(v)) { + PyObject *tmp = v; + v = PyNumber_Long(tmp); + Py_DECREF(tmp); + } + #endif + if (likely(v)) { + int one = 1; int is_little = (int)*(unsigned char *)&one; + unsigned char *bytes = (unsigned char *)&val; + int ret = _PyLong_AsByteArray((PyLongObject *)v, + bytes, sizeof(val), + is_little, !is_unsigned); + Py_DECREF(v); + if (likely(!ret)) + return val; + } +#endif + return (int) -1; + } + } else { + int val; + PyObject *tmp = __Pyx_PyNumber_IntOrLong(x); + if (!tmp) return (int) -1; + val = __Pyx_PyInt_As_int(tmp); + Py_DECREF(tmp); + return val; + } +raise_overflow: + PyErr_SetString(PyExc_OverflowError, + "value too large to convert to int"); + return (int) -1; +raise_neg_overflow: + PyErr_SetString(PyExc_OverflowError, + "can't convert negative value to int"); + return (int) -1; +} + +/* FastTypeChecks */ + #if CYTHON_COMPILING_IN_CPYTHON +static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) { + while (a) { + a = a->tp_base; + if (a == b) + return 1; + } + return b == &PyBaseObject_Type; +} +static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) { + PyObject *mro; + if (a == b) return 1; + mro = a->tp_mro; + if (likely(mro)) { + Py_ssize_t i, n; + n = PyTuple_GET_SIZE(mro); + for (i = 0; i < n; i++) { + if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b) + return 1; + } + return 0; + } + return __Pyx_InBases(a, b); +} +#if PY_MAJOR_VERSION == 2 +static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) { + PyObject *exception, *value, *tb; + int res; + __Pyx_PyThreadState_declare + __Pyx_PyThreadState_assign + __Pyx_ErrFetch(&exception, &value, &tb); + res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0; + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + if (!res) { + res = PyObject_IsSubclass(err, exc_type2); + if (unlikely(res == -1)) { + PyErr_WriteUnraisable(err); + res = 0; + } + } + __Pyx_ErrRestore(exception, value, tb); + return res; +} +#else +static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) { + int res = exc_type1 ? 
__Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0; + if (!res) { + res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2); + } + return res; +} +#endif +static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) { + Py_ssize_t i, n; + assert(PyExceptionClass_Check(exc_type)); + n = PyTuple_GET_SIZE(tuple); +#if PY_MAJOR_VERSION >= 3 + for (i=0; i '9'); + break; + } + if (rt_from_call[i] != ctversion[i]) { + same = 0; + break; + } + } + if (!same) { + char rtversion[5] = {'\0'}; + char message[200]; + for (i=0; i<4; ++i) { + if (rt_from_call[i] == '.') { + if (found_dot) break; + found_dot = 1; + } else if (rt_from_call[i] < '0' || rt_from_call[i] > '9') { + break; + } + rtversion[i] = rt_from_call[i]; + } + PyOS_snprintf(message, sizeof(message), + "compiletime version %s of module '%.100s' " + "does not match runtime version %s", + ctversion, __Pyx_MODULE_NAME, rtversion); + return PyErr_WarnEx(NULL, message, 1); + } + return 0; +} + +/* InitStrings */ + static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { + while (t->p) { + #if PY_MAJOR_VERSION < 3 + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else if (t->intern) { + *t->p = PyString_InternFromString(t->s); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } + #else + if (t->is_unicode | t->is_str) { + if (t->intern) { + *t->p = PyUnicode_InternFromString(t->s); + } else if (t->encoding) { + *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL); + } else { + *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1); + } + } else { + *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1); + } + #endif + if (!*t->p) + return -1; + if (PyObject_Hash(*t->p) == -1) + return -1; + ++t; + } + return 0; +} + +static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) { + return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str)); +} +static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) { + Py_ssize_t ignore; + return __Pyx_PyObject_AsStringAndSize(o, &ignore); +} +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT +#if !CYTHON_PEP393_ENABLED +static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + char* defenc_c; + PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL); + if (!defenc) return NULL; + defenc_c = PyBytes_AS_STRING(defenc); +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + { + char* end = defenc_c + PyBytes_GET_SIZE(defenc); + char* c; + for (c = defenc_c; c < end; c++) { + if ((unsigned char) (*c) >= 128) { + PyUnicode_AsASCIIString(o); + return NULL; + } + } + } +#endif + *length = PyBytes_GET_SIZE(defenc); + return defenc_c; +} +#else +static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) { + if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL; +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + if (likely(PyUnicode_IS_ASCII(o))) { + *length = PyUnicode_GET_LENGTH(o); + return PyUnicode_AsUTF8(o); + } else { + PyUnicode_AsASCIIString(o); + return NULL; + } +#else + return PyUnicode_AsUTF8AndSize(o, length); +#endif +} +#endif +#endif +static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) { +#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT + if ( +#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII + __Pyx_sys_getdefaultencoding_not_ascii && +#endif + 
PyUnicode_Check(o)) { + return __Pyx_PyUnicode_AsStringAndSize(o, length); + } else +#endif +#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE)) + if (PyByteArray_Check(o)) { + *length = PyByteArray_GET_SIZE(o); + return PyByteArray_AS_STRING(o); + } else +#endif + { + char* result; + int r = PyBytes_AsStringAndSize(o, &result, length); + if (unlikely(r < 0)) { + return NULL; + } else { + return result; + } + } +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { + int is_true = x == Py_True; + if (is_true | (x == Py_False) | (x == Py_None)) return is_true; + else return PyObject_IsTrue(x); +} +static CYTHON_INLINE int __Pyx_PyObject_IsTrueAndDecref(PyObject* x) { + int retval; + if (unlikely(!x)) return -1; + retval = __Pyx_PyObject_IsTrue(x); + Py_DECREF(x); + return retval; +} +static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) { +#if PY_MAJOR_VERSION >= 3 + if (PyLong_Check(result)) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "__int__ returned non-int (type %.200s). " + "The ability to return an instance of a strict subclass of int " + "is deprecated, and may be removed in a future version of Python.", + Py_TYPE(result)->tp_name)) { + Py_DECREF(result); + return NULL; + } + return result; + } +#endif + PyErr_Format(PyExc_TypeError, + "__%.4s__ returned non-%.4s (type %.200s)", + type_name, type_name, Py_TYPE(result)->tp_name); + Py_DECREF(result); + return NULL; +} +static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) { +#if CYTHON_USE_TYPE_SLOTS + PyNumberMethods *m; +#endif + const char *name = NULL; + PyObject *res = NULL; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_Check(x) || PyLong_Check(x))) +#else + if (likely(PyLong_Check(x))) +#endif + return __Pyx_NewRef(x); +#if CYTHON_USE_TYPE_SLOTS + m = Py_TYPE(x)->tp_as_number; + #if PY_MAJOR_VERSION < 3 + if (m && m->nb_int) { + name = "int"; + res = m->nb_int(x); + } + else if (m && m->nb_long) { + name = "long"; + res = m->nb_long(x); + } + #else + if (likely(m && m->nb_int)) { + name = "int"; + res = m->nb_int(x); + } + #endif +#else + if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) { + res = PyNumber_Int(x); + } +#endif + if (likely(res)) { +#if PY_MAJOR_VERSION < 3 + if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) { +#else + if (unlikely(!PyLong_CheckExact(res))) { +#endif + return __Pyx_PyNumber_IntOrLongWrongResultType(res, name); + } + } + else if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_TypeError, + "an integer is required"); + } + return res; +} +static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) { + Py_ssize_t ival; + PyObject *x; +#if PY_MAJOR_VERSION < 3 + if (likely(PyInt_CheckExact(b))) { + if (sizeof(Py_ssize_t) >= sizeof(long)) + return PyInt_AS_LONG(b); + else + return PyInt_AsSsize_t(b); + } +#endif + if (likely(PyLong_CheckExact(b))) { + #if CYTHON_USE_PYLONG_INTERNALS + const digit* digits = ((PyLongObject*)b)->ob_digit; + const Py_ssize_t size = Py_SIZE(b); + if (likely(__Pyx_sst_abs(size) <= 1)) { + ival = likely(size) ? 
digits[0] : 0; + if (size == -1) ival = -ival; + return ival; + } else { + switch (size) { + case 2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -2: + if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -3: + if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case 4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + case -4: + if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) { + return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])); + } + break; + } + } + #endif + return PyLong_AsSsize_t(b); + } + x = PyNumber_Index(b); + if (!x) return -1; + ival = PyInt_AsSsize_t(x); + Py_DECREF(x); + return ival; +} +static CYTHON_INLINE Py_hash_t __Pyx_PyIndex_AsHash_t(PyObject* o) { + if (sizeof(Py_hash_t) == sizeof(Py_ssize_t)) { + return (Py_hash_t) __Pyx_PyIndex_AsSsize_t(o); +#if PY_MAJOR_VERSION < 3 + } else if (likely(PyInt_CheckExact(o))) { + return PyInt_AS_LONG(o); +#endif + } else { + Py_ssize_t ival; + PyObject *x; + x = PyNumber_Index(o); + if (!x) return -1; + ival = PyInt_AsLong(x); + Py_DECREF(x); + return ival; + } +} +static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) { + return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False); +} +static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) { + return PyInt_FromSize_t(ival); +} + + +#endif /* Py_PYTHON_H */ diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.pyx b/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.pyx new file mode 100755 index 0000000000000000000000000000000000000000..3cedef8f6062b53000ba52c28eb3cb8e868d9c59 --- /dev/null +++ b/models/cv/detection/retinaface/igie/widerface_evaluate/box_overlaps.pyx @@ -0,0 +1,57 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# Licensed under The MIT License [see LICENSE for details] +# Written by Sergey Karayev +# -------------------------------------------------------- + +cimport cython +import numpy as np +cimport numpy as np + +DTYPE = float +ctypedef double DTYPE_t + +def bbox_overlaps( + np.ndarray[DTYPE_t, ndim=2] boxes, + np.ndarray[DTYPE_t, ndim=2] query_boxes): + """ + Parameters + ---------- + boxes: (N, 4) ndarray of float + query_boxes: (K, 4) ndarray of float + Returns + ------- + overlaps: (N, K) ndarray of overlap between boxes and query_boxes + """ + cdef unsigned int N = boxes.shape[0] + cdef unsigned int K = query_boxes.shape[0] + cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) + cdef DTYPE_t iw, ih, box_area + cdef DTYPE_t ua + cdef unsigned int k, n + for k in range(K): + box_area = ( + (query_boxes[k, 2] - query_boxes[k, 0] + 1) * + (query_boxes[k, 3] - query_boxes[k, 1] + 1) + ) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + 1 + ) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + 1 + ) + if ih > 0: + ua = float( + (boxes[n, 2] - boxes[n, 0] + 1) * + (boxes[n, 3] - boxes[n, 1] + 1) + + box_area - iw * ih + ) + overlaps[n, k] = iw * ih / ua + return overlaps diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/evaluation.py b/models/cv/detection/retinaface/igie/widerface_evaluate/evaluation.py new file mode 100755 index 0000000000000000000000000000000000000000..12b5b962216b21491fd8b9f176dbc6c8ffbf9a1d --- /dev/null +++ b/models/cv/detection/retinaface/igie/widerface_evaluate/evaluation.py @@ -0,0 +1,295 @@ +""" +WiderFace evaluation code +author: wondervictor +mail: tianhengcheng@gmail.com +copyright@wondervictor +""" + +import os +import tqdm +import pickle +import argparse +import numpy as np +from scipy.io import loadmat +from bbox import bbox_overlaps + +def get_gt_boxes(gt_dir): + """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" + + gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) + hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat')) + medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat')) + easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat')) + + facebox_list = gt_mat['face_bbx_list'] + event_list = gt_mat['event_list'] + file_list = gt_mat['file_list'] + + hard_gt_list = hard_mat['gt_list'] + medium_gt_list = medium_mat['gt_list'] + easy_gt_list = easy_mat['gt_list'] + + return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list + + +def get_gt_boxes_from_txt(gt_path, cache_dir): + + cache_file = os.path.join(cache_dir, 'gt_cache.pkl') + if os.path.exists(cache_file): + f = open(cache_file, 'rb') + boxes = pickle.load(f) + f.close() + return boxes + + f = open(gt_path, 'r') + state = 0 + lines = f.readlines() + lines = list(map(lambda x: x.rstrip('\r\n'), lines)) + boxes = {} + print(len(lines)) + f.close() + current_boxes = [] + current_name = None + for line in lines: + if state == 0 and '--' in line: + state = 1 + current_name = line + continue + if state == 1: + state = 2 + continue + + if state == 2 and '--' in line: + state = 1 + boxes[current_name] = np.array(current_boxes).astype('float32') + current_name = line + current_boxes = [] + continue + + if state == 2: + box = [float(x) for x in line.split(' ')[:4]] + current_boxes.append(box) + continue + + f = open(cache_file, 'wb') + pickle.dump(boxes, f) 
+ f.close() + return boxes + + +def read_pred_file(filepath): + with open(filepath, 'r') as f: + lines = f.readlines() + img_file = lines[0].rstrip('\n\r') + lines = lines[2:] + + boxes = [] + for line in lines: + line = line.rstrip('\r\n').split(' ') + if line[0] == '': + continue + boxes.append([float(line[0]), float(line[1]), float(line[2]), float(line[3]), float(line[4])]) + boxes = np.array(boxes) + return img_file.split('/')[-1], boxes + + +def get_preds(pred_dir): + events = os.listdir(pred_dir) + boxes = dict() + pbar = tqdm.tqdm(events) + + for event in pbar: + pbar.set_description('Reading Predictions ') + event_dir = os.path.join(pred_dir, event) + event_images = os.listdir(event_dir) + current_event = dict() + for imgtxt in event_images: + imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) + current_event[imgname.rstrip('.jpg')] = _boxes + boxes[event] = current_event + return boxes + + +def norm_score(pred): + """ norm score + pred {key: [[x1,y1,x2,y2,s]]} + """ + + max_score = 0 + min_score = 1 + + for _, k in pred.items(): + for _, v in k.items(): + if len(v) == 0: + continue + _min = np.min(v[:, -1]) + _max = np.max(v[:, -1]) + max_score = max(_max, max_score) + min_score = min(_min, min_score) + + diff = max_score - min_score + for _, k in pred.items(): + for _, v in k.items(): + if len(v) == 0: + continue + v[:, -1] = (v[:, -1] - min_score)/diff + + +def image_eval(pred, gt, ignore, iou_thresh): + """ single image evaluation + pred: Nx5 + gt: Nx4 + ignore: + """ + + _pred = pred.copy() + _gt = gt.copy() + pred_recall = np.zeros(_pred.shape[0]) + recall_list = np.zeros(_gt.shape[0]) + proposal_list = np.ones(_pred.shape[0]) + + _pred[:, 2] = _pred[:, 2] + _pred[:, 0] + _pred[:, 3] = _pred[:, 3] + _pred[:, 1] + _gt[:, 2] = _gt[:, 2] + _gt[:, 0] + _gt[:, 3] = _gt[:, 3] + _gt[:, 1] + + overlaps = bbox_overlaps(_pred[:, :4], _gt) + + for h in range(_pred.shape[0]): + + gt_overlap = overlaps[h] + max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() + if max_overlap >= iou_thresh: + if ignore[max_idx] == 0: + recall_list[max_idx] = -1 + proposal_list[h] = -1 + elif recall_list[max_idx] == 0: + recall_list[max_idx] = 1 + + r_keep_index = np.where(recall_list == 1)[0] + pred_recall[h] = len(r_keep_index) + return pred_recall, proposal_list + + +def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): + pr_info = np.zeros((thresh_num, 2)).astype('float') + for t in range(thresh_num): + + thresh = 1 - (t+1)/thresh_num + r_index = np.where(pred_info[:, 4] >= thresh)[0] + if len(r_index) == 0: + pr_info[t, 0] = 0 + pr_info[t, 1] = 0 + else: + r_index = r_index[-1] + p_index = np.where(proposal_list[:r_index+1] == 1)[0] + pr_info[t, 0] = len(p_index) + pr_info[t, 1] = pred_recall[r_index] + return pr_info + + +def dataset_pr_info(thresh_num, pr_curve, count_face): + _pr_curve = np.zeros((thresh_num, 2)) + for i in range(thresh_num): + _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] + _pr_curve[i, 1] = pr_curve[i, 1] / count_face + return _pr_curve + + +def voc_ap(rec, prec): + + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 
1] - mrec[i]) * mpre[i + 1]) + return ap + + +def evaluation(pred, gt_path, iou_thresh=0.5): + pred = get_preds(pred) + norm_score(pred) + facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_path) + event_num = len(event_list) + thresh_num = 1000 + settings = ['easy', 'medium', 'hard'] + setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] + aps = [] + for setting_id in range(3): + # different setting + gt_list = setting_gts[setting_id] + count_face = 0 + pr_curve = np.zeros((thresh_num, 2)).astype('float') + # [hard, medium, easy] + pbar = tqdm.tqdm(range(event_num)) + for i in pbar: + pbar.set_description('Processing {}'.format(settings[setting_id])) + event_name = str(event_list[i][0][0]) + img_list = file_list[i][0] + pred_list = pred[event_name] + sub_gt_list = gt_list[i][0] + # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) + gt_bbx_list = facebox_list[i][0] + + for j in range(len(img_list)): + pred_info = pred_list[str(img_list[j][0][0])] + + gt_boxes = gt_bbx_list[j][0].astype('float') + keep_index = sub_gt_list[j][0] + count_face += len(keep_index) + + if len(gt_boxes) == 0 or len(pred_info) == 0: + continue + ignore = np.zeros(gt_boxes.shape[0]) + if len(keep_index) != 0: + ignore[keep_index-1] = 1 + pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh) + + _img_pr_info = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall) + + pr_curve += _img_pr_info + pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) + + propose = pr_curve[:, 0] + recall = pr_curve[:, 1] + + ap = voc_ap(recall, propose) + aps.append(ap) + + print("==================== Results ====================") + print("Easy Val AP: {}".format(aps[0])) + print("Medium Val AP: {}".format(aps[1])) + print("Hard Val AP: {}".format(aps[2])) + print("=================================================") + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--pred', default="./widerface_txt/") + parser.add_argument('-g', '--gt', default='./ground_truth/') + + args = parser.parse_args() + evaluation(args.pred, args.gt) + + + + + + + + + + + + diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_easy_val.mat b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_easy_val.mat new file mode 100755 index 0000000000000000000000000000000000000000..5b78df7c006982ab1a2038149f1f5168b2237f07 Binary files /dev/null and b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_easy_val.mat differ diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_face_val.mat b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_face_val.mat new file mode 100755 index 0000000000000000000000000000000000000000..7ac47e55b72d04c5fa237321509b2d2319f8ded2 Binary files /dev/null and b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_face_val.mat differ diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_hard_val.mat b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_hard_val.mat new file mode 100755 index 0000000000000000000000000000000000000000..061fcab0761d4fb6a0574756ae5c3d4591aecfc7 Binary files /dev/null and b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_hard_val.mat differ diff --git 
a/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_medium_val.mat b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_medium_val.mat new file mode 100755 index 0000000000000000000000000000000000000000..9670c7a96d8af29c59d6b08cab7765b8bc427182 Binary files /dev/null and b/models/cv/detection/retinaface/igie/widerface_evaluate/ground_truth/wider_medium_val.mat differ diff --git a/models/cv/detection/retinaface/igie/widerface_evaluate/setup.py b/models/cv/detection/retinaface/igie/widerface_evaluate/setup.py new file mode 100755 index 0000000000000000000000000000000000000000..74dba05b3809f76de0db678342d5c94520c5066c --- /dev/null +++ b/models/cv/detection/retinaface/igie/widerface_evaluate/setup.py @@ -0,0 +1,13 @@ +""" +WiderFace evaluation code +author: wondervictor +mail: tianhengcheng@gmail.com +copyright@wondervictor +""" + +from distutils.core import setup, Extension +from Cython.Build import cythonize +import numpy + +package = Extension('bbox', ['box_overlaps.pyx'], include_dirs=[numpy.get_include()]) +setup(ext_modules=cythonize([package])) diff --git a/models/cv/detection/rtmdet/igie/README.md b/models/cv/detection/rtmdet/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..25b1f9b3af2be339fd701f2654641e75afe837cf --- /dev/null +++ b/models/cv/detection/rtmdet/igie/README.md @@ -0,0 +1,66 @@ +# RTMDet + +## Description + +RTMDet, presented by the Shanghai AI Laboratory, is a novel framework for real-time object detection that surpasses the efficiency of the YOLO series. The model's architecture is meticulously crafted for optimal efficiency, employing a basic building block consisting of large-kernel depth-wise convolutions in both the backbone and neck, which enhances the model's ability to capture global context. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmdet==3.3.0 +pip3 install mmpose==1.3.1 +pip3 install mmdeploy==1.3.1 +pip3 install mmengine==0.10.4 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +```bash +wget https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth --cfg rtmdet_nano_320-8xb32_coco-person.py --output rtmdet.onnx + +# use onnxsim optimize onnx model +onnxsim rtmdet.onnx rtmdet_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/coco/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_rtmdet_fp16_accuracy.sh +# Performance +bash scripts/infer_rtmdet_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Input Shape | Precision | FPS | mAP@0.5(%) | +| :----: | :-------: | :---------: | :-------: | :------: | :--------: | +| RTMDet | 32 | 320x320 | FP16 | 4006.849 | 0.619 | diff --git a/models/cv/detection/rtmdet/igie/build_engine.py b/models/cv/detection/rtmdet/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/detection/rtmdet/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/detection/rtmdet/igie/deploy_default.py b/models/cv/detection/rtmdet/igie/deploy_default.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d8e43dc829456f0c2e46a7acfc3128757f945d --- /dev/null +++ b/models/cv/detection/rtmdet/igie/deploy_default.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
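+# Deploy config in mmdeploy's standard three-part layout:
+# - onnx_config controls the exported ONNX graph (opset 11, static 'input'/'output' names).
+# - codebase_config applies mmdet's end-to-end detection post-processing (score/IoU thresholds, top-k limits).
+# - backend_config selects onnxruntime as the reference backend for the export.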
+ +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['output'], + input_shape=None, + optimize=True) + +codebase_config = dict( + type='mmdet', + task='ObjectDetection', + model_type='end2end', + post_processing=dict( + score_threshold=0.05, + confidence_threshold=0.005, + iou_threshold=0.5, + max_output_boxes_per_class=200, + pre_top_k=5000, + keep_top_k=100, + background_label_id=-1, + )) + +backend_config = dict(type='onnxruntime') \ No newline at end of file diff --git a/models/cv/detection/rtmdet/igie/export.py b/models/cv/detection/rtmdet/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..a30efa81807460f6a7c2b9821509cb7270ec1730 --- /dev/null +++ b/models/cv/detection/rtmdet/igie/export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import argparse + +import torch +from mmdeploy.utils import load_config +from mmdeploy.apis import build_task_processor + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + deploy_cfg = 'deploy_default.py' + model_cfg = args.cfg + model_checkpoint = args.weight + + deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + + task_processor = build_task_processor(model_cfg, deploy_cfg, device='cpu') + + model = task_processor.build_pytorch_model(model_checkpoint) + + input_names = ['input'] + dynamic_axes = {'input': {0: '-1'}} + dummy_input = torch.randn(1, 3, 320, 320) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/cv/detection/rtmdet/igie/inference.py b/models/cv/detection/rtmdet/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..4c89904b29c95553f2b88a19cb1447a523b9819f --- /dev/null +++ b/models/cv/detection/rtmdet/igie/inference.py @@ -0,0 +1,153 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import os +import argparse +import tvm +import torch +import numpy as np +from tvm import relay +from tqdm import tqdm +from mmpose.registry import RUNNERS +from mmengine.config import Config + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # runner config + cfg = Config.fromfile("rtmdet_nano_320-8xb32_coco-person.py") + + cfg.work_dir = "./" + cfg['test_dataloader']['batch_size'] = batch_size + cfg['test_dataloader']['dataset']['data_root'] = args.datasets + cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'images/val2017/' + cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/person_keypoints_val2017.json') + cfg['log_level'] = 'ERROR' + + # build runner + runner = RUNNERS.build(cfg) + + for data in tqdm(runner.test_dataloader): + cls_score = [] + box_reg = [] + input_data = runner.model.data_preprocessor(data, False) + image = input_data['inputs'].cpu() + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + for i in range(module.get_num_outputs()): + output = module.get_output(i).asnumpy() + + if pad_batch: + output = output[:origin_size] + + output = torch.from_numpy(output) + if output.shape[1] == 4: + box_reg.append(output) + else: + cls_score.append(output) + + batch_img_metas = [ + data_samples.metainfo for data_samples in data['data_samples'] + ] + + preds = 
runner.model.bbox_head.predict_by_feat(cls_score, box_reg, batch_img_metas=batch_img_metas, rescale=True) + + batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], preds) + + runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=data) + + metrics = runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset)) + + +if __name__ == "__main__": + main() diff --git a/models/cv/detection/rtmdet/igie/rtmdet_nano_320-8xb32_coco-person.py b/models/cv/detection/rtmdet/igie/rtmdet_nano_320-8xb32_coco-person.py new file mode 100644 index 0000000000000000000000000000000000000000..6e71fb66fda1f1cb566eb8bb1759795317991633 --- /dev/null +++ b/models/cv/detection/rtmdet/igie/rtmdet_nano_320-8xb32_coco-person.py @@ -0,0 +1,529 @@ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +base_lr = 0.004 +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0002, + priority=49, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=280, + switch_pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 1.5, + ), + scale=( + 320, + 320, + ), + type='RandomResize'), + dict(crop_size=( + 320, + 320, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 320, + 320, + ), + type='Pad'), + dict(type='PackDetInputs'), + ], + type='PipelineSwitchHook'), +] +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict( + _scope_='mmdet', interval=10, max_keep_ckpts=3, type='CheckpointHook'), + logger=dict(_scope_='mmdet', interval=50, type='LoggerHook'), + param_scheduler=dict(_scope_='mmdet', type='ParamSchedulerHook'), + sampler_seed=dict(_scope_='mmdet', type='DistSamplerSeedHook'), + timer=dict(_scope_='mmdet', type='IterTimerHook'), + visualization=dict(_scope_='mmdet', type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +input_shape = 320 +interval = 10 +load_from = None +log_level = 'ERROR' +log_processor = dict( + _scope_='mmdet', by_epoch=True, type='LogProcessor', window_size=50) +max_epochs = 300 +model = dict( + _scope_='mmdet', + backbone=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + channel_attention=True, + deepen_factor=0.33, + expand_ratio=0.5, + norm_cfg=dict(type='SyncBN'), + type='CSPNeXt', + use_depthwise=True, + widen_factor=0.25), + bbox_head=dict( + act_cfg=dict(inplace=True, type='SiLU'), + anchor_generator=dict( + offset=0, strides=[ + 8, + 16, + 32, + ], type='MlvlPointGenerator'), + bbox_coder=dict(type='DistancePointBBoxCoder'), + exp_on_reg=False, + feat_channels=64, + in_channels=64, + loss_bbox=dict(loss_weight=2.0, type='GIoULoss'), + loss_cls=dict( + beta=2.0, + loss_weight=1.0, + type='QualityFocalLoss', + use_sigmoid=True), + norm_cfg=dict(type='SyncBN'), + num_classes=1, + pred_kernel_size=1, + share_conv=False, + stacked_convs=2, + type='RTMDetSepBNHead', + use_depthwise=True, + with_objectness=False), + data_preprocessor=dict( + batch_augments=None, + bgr_to_rgb=False, + mean=[ + 103.53, + 116.28, + 123.675, + ], + std=[ + 57.375, + 57.12, + 58.395, + ], + type='DetDataPreprocessor'), + neck=dict( + 
act_cfg=dict(inplace=True, type='SiLU'), + expand_ratio=0.5, + in_channels=[ + 64, + 128, + 256, + ], + norm_cfg=dict(type='SyncBN'), + num_csp_blocks=1, + out_channels=64, + type='CSPNeXtPAFPN', + use_depthwise=True), + test_cfg=dict( + max_per_img=100, + min_bbox_size=0, + nms=dict(iou_threshold=0.6, type='nms'), + nms_pre=1000, + score_thr=0.05), + train_cfg=dict( + allowed_border=-1, + assigner=dict(topk=13, type='DynamicSoftLabelAssigner'), + debug=False, + pos_weight=-1), + type='RTMDet') +optim_wrapper = dict( + _scope_='mmdet', + optimizer=dict(lr=0.004, type='AdamW', weight_decay=0.05), + paramwise_cfg=dict( + bias_decay_mult=0, bypass_duplicate=True, norm_decay_mult=0), + type='OptimWrapper') +param_scheduler = [ + dict( + _scope_='mmdet', + begin=0, + by_epoch=False, + end=1000, + start_factor=1e-05, + type='LinearLR'), + dict( + T_max=150, + _scope_='mmdet', + begin=150, + by_epoch=True, + convert_to_iter_based=True, + end=300, + eta_min=0.0002, + type='CosineAnnealingLR'), +] +resume = False +stage2_num_epochs = 20 +test_cfg = dict(_scope_='mmdet', type='TestLoop') +test_dataloader = dict( + batch_size=32, + dataset=dict( + _scope_='mmdet', + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='images/val2017/'), + data_root='/root/.igie_cache/data/datasets/coco', + metainfo=dict(classes=('person', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 320, + 320, + ), + type='Pad'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(_scope_='mmdet', shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + _scope_='mmdet', + ann_file= + '/root/.igie_cache/data/datasets/coco/annotations/person_keypoints_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 320, + 320, + ), type='Pad'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict( + _scope_='mmdet', + dynamic_intervals=[ + ( + 280, + 1, + ), + ], + max_epochs=300, + type='EpochBasedTrainLoop', + val_interval=10) +train_dataloader = dict( + batch_sampler=None, + batch_size=32, + dataset=dict( + _scope_='mmdet', + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + metainfo=dict(classes=('person', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 320, + 320, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 1.5, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 320, + 320, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 320, + 320, + ), + type='Pad'), + 
dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=10, + persistent_workers=True, + pin_memory=True, + sampler=dict(_scope_='mmdet', shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 320, + 320, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 1.5, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 320, + 320, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 320, + 320, + ), type='Pad'), + dict(type='PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 1.5, + ), + scale=( + 320, + 320, + ), + type='RandomResize'), + dict(crop_size=( + 320, + 320, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 320, + 320, + ), type='Pad'), + dict(type='PackDetInputs'), +] +tta_model = dict( + _scope_='mmdet', + tta_cfg=dict(max_per_img=100, nms=dict(iou_threshold=0.6, type='nms')), + type='DetTTAModel') +tta_pipeline = [ + dict(_scope_='mmdet', backend_args=None, type='LoadImageFromFile'), + dict( + _scope_='mmdet', + transforms=[ + [ + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 960, + 960, + ), type='Resize'), + ], + [ + dict(prob=1.0, type='RandomFlip'), + dict(prob=0.0, type='RandomFlip'), + ], + [ + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 960, + 960, + ), + type='Pad'), + ], + [ + dict(type='LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + 'flip', + 'flip_direction', + ), + type='PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_cfg = dict(_scope_='mmdet', type='ValLoop') +val_dataloader = dict( + batch_size=5, + dataset=dict( + _scope_='mmdet', + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + metainfo=dict(classes=('person', )), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 320, + 320, + ), + type='Pad'), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(_scope_='mmdet', shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + _scope_='mmdet', + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +vis_backends = [ + dict(_scope_='mmdet', type='LocalVisBackend'), +] +visualizer = dict( + _scope_='mmdet', + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' diff --git a/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_accuracy.sh 
b/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..c199f8404ec20cbbe52e7de1035e8665b42c47ad --- /dev/null +++ b/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="rtmdet_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,320,320 \ + --precision fp16 \ + --engine_path rtmdet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine rtmdet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_performance.sh b/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..dbf78ee88ff36da819a961e7384025a8f1fabc8b --- /dev/null +++ b/models/cv/detection/rtmdet/igie/scripts/infer_rtmdet_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
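+
+# Performance-only counterpart of the accuracy script above: build a batch-32 FP16
+# engine from rtmdet_opt.onnx, then call inference.py with --perf_only True so that
+# only the engine timing loop runs and no COCO accuracy evaluation is performed.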
+ +batchsize=32 +model_path="rtmdet_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,320,320 \ + --precision fp16 \ + --engine_path rtmdet_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine rtmdet_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/detection/yolox/ixrt/python/inference.py b/models/cv/detection/yolox/ixrt/python/inference.py index aa3ae2f513968d46e2a324338c0490becd24e206..a6c681542956f41e2f8fb2a379d11832e2d09817 100644 --- a/models/cv/detection/yolox/ixrt/python/inference.py +++ b/models/cv/detection/yolox/ixrt/python/inference.py @@ -30,47 +30,47 @@ from utils import COCO2017Dataset, COCO2017Evaluator def parse_args(): parser = argparse.ArgumentParser() - - parser.add_argument("--engine", - type=str, - required=True, + + parser.add_argument("--engine", + type=str, + required=True, help="igie engine path.") - + parser.add_argument("--batchsize", type=int, - required=True, + required=True, help="inference batch size.") - - parser.add_argument("--datasets", - type=str, - required=True, + + parser.add_argument("--datasets", + type=str, + required=True, help="datasets path.") - - parser.add_argument("--warmup", - type=int, - default=5, - help="number of warmup before test.") - + + parser.add_argument("--warmup", + type=int, + default=5, + help="number of warmup before test.") + parser.add_argument("--num_workers", type=int, default=16, help="number of workers used in pytorch dataloader.") - + parser.add_argument("--acc_target", type=float, default=None, help="Model inference Accuracy target.") - + parser.add_argument("--fps_target", type=float, default=None, help="Model inference FPS target.") - + parser.add_argument("--conf", type=float, default=0.001, help="confidence threshold.") - + parser.add_argument("--iou", type=float, default=0.65, @@ -80,11 +80,11 @@ def parse_args(): type=bool, default=False, help="Run performance test only") - parser.add_argument("--loop_count", - type=int, - default=-1, + parser.add_argument("--loop_count", + type=int, + default=-1, help="loop count") - + args = parser.parse_args() return args @@ -188,11 +188,11 @@ def main(): host_mem = tensorrt.IHostMemory logger = tensorrt.Logger(tensorrt.Logger.ERROR) - + # Load Engine engine, context = create_engine_context(args.engine, logger) inputs, outputs, allocations = get_io_bindings(engine) - + # Warm up print("\nWarm Start.") for i in range(args.warmup): @@ -218,27 +218,30 @@ def main(): conf_thres=args.conf, iou_thres=args.iou, image_size=640) - + start_time = time.time() for all_inputs in tqdm(dataloader): image = all_inputs[0] pad_batch = len(image) != batch_size if pad_batch: origin_size = len(image) image = np.resize(image, (batch_size, *image.shape[1:])) - + cuda.memcpy_htod(inputs[0]["allocation"], image) context.execute_v2(allocations) - + cuda.memcpy_dtoh(output_np, outputs[0]["allocation"]) # print("output_np") # print(output_np) - + if pad_batch: output_np = output_np[:origin_size] - + evaluator.evaluate(output_np, all_inputs) - + end_time = time.time() + end2end_time = end_time - start_time + print(F"E2E time : {end2end_time:.3f} seconds") + evaluator.summary() - + if __name__ == "__main__": main() \ No newline at end of file diff --git a/models/cv/detection/yolox/ixrt/python/utils.py b/models/cv/detection/yolox/ixrt/python/utils.py index 
e6bca6dcd5bfa645d98b54e9e98ecd8f0a0b8d67..314aed2e7d9c20cf71e205a6a032369f8697c21f 100644 --- a/models/cv/detection/yolox/ixrt/python/utils.py +++ b/models/cv/detection/yolox/ixrt/python/utils.py @@ -82,7 +82,7 @@ class COCO2017Dataset(torch.utils.data.Dataset): self.input_layout = input_layout self.coco = COCO(annotation_file=self.label_json_path) - + if self.val_mode: self.img_ids = list(sorted(self.coco.imgs.keys())) else: @@ -96,7 +96,7 @@ class COCO2017Dataset(torch.utils.data.Dataset): img = self._load_image(img_path) img, r = self.preproc(img, input_size=self.image_size) - + return img, img_path, r def _get_image_path(self, index): @@ -110,13 +110,13 @@ class COCO2017Dataset(torch.utils.data.Dataset): assert img is not None, f"file {img_path} not found" return img - + def preproc(self, img, input_size, swap=(2, 0, 1)): if len(img.shape) == 3: padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 else: padded_img = np.ones(input_size, dtype=np.uint8) * 114 - + org_img = (img.shape[0], img.shape[1]) r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) resized_img = cv2.resize( @@ -128,10 +128,10 @@ class COCO2017Dataset(torch.utils.data.Dataset): padded_img = padded_img.transpose(swap) padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - + return padded_img, org_img - + def _load_json_label(self, index): _, (h0, w0), _ = self._load_image(index) @@ -171,19 +171,19 @@ def get_coco_accuracy(pred_json, ann_json): coco_pred = coco.loadRes(pred_json) coco_evaluator = COCOeval(cocoGt=coco, cocoDt=coco_pred, iouType="bbox") - + coco_evaluator.evaluate() coco_evaluator.accumulate() coco_evaluator.summarize() return coco_evaluator.stats -class COCO2017Evaluator: +class COCO2017Evaluator: def __init__(self, label_path, image_size=640, conf_thres=0.001, iou_thres=0.65): - + self.conf_thres = conf_thres self.iou_thres = iou_thres self.label_path = label_path @@ -192,14 +192,14 @@ class COCO2017Evaluator: self.jdict = [] # iou vector for mAP@0.5:0.95 - self.iouv = torch.linspace(0.5, 0.95, 10) + self.iouv = torch.linspace(0.5, 0.95, 10) self.niou = self.iouv.numel() - + def evaluate(self, pred, all_inputs): im = all_inputs[0] img_path = all_inputs[1] img_info = all_inputs[2] - + _, _, height, width = im.shape img_size = [height, width] @@ -212,7 +212,7 @@ class COCO2017Evaluator: for (output, org_img, path) in zip(nms_outputs, img_info, img_path): if output is None: continue - + bboxes = output[:, 0:4] img_h, img_w = org_img @@ -222,11 +222,11 @@ class COCO2017Evaluator: bboxes /= scale cls = output[:, 6] scores = output[:, 4] * output[:, 5] - + bboxes = self._xyxy2xywh(bboxes) self._save_one_json(bboxes, cls, scores, self.jdict, path, coco80_to_coco91) - + def Detect(self, outputs, img_size): grids = [] expanded_strides = [] @@ -247,7 +247,7 @@ class COCO2017Evaluator: expanded_strides = np.concatenate(expanded_strides, 1) outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides - + return outputs def postprocess(self, prediction, num_classes=80, conf_thre=0.7, nms_thre=0.45, class_agnostic=False): @@ -257,7 +257,7 @@ class COCO2017Evaluator: box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 prediction[:, :, :4] = box_corner[:, :, :4] - + output = [None for _ in range(len(prediction))] for i, image_pred in enumerate(prediction): @@ -271,7 +271,7 @@ class COCO2017Evaluator: # Detections 
ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) detections = detections[conf_mask] - + if not detections.size(0): continue if class_agnostic: @@ -295,7 +295,7 @@ class COCO2017Evaluator: output[i] = torch.cat((output[i], detections)) return output - + def _xyxy2xywh(self, bboxes): bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] diff --git a/models/cv/pose_estimation/rtmpose/igie/README.md b/models/cv/pose_estimation/rtmpose/igie/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02cbc5fca1e5000c9317e60b51839d421d134e6f --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/README.md @@ -0,0 +1,66 @@ +# RTMPose + +## Description + +RTMPose, a state-of-the-art framework developed by Shanghai AI Laboratory, excels in real-time multi-person pose estimation by integrating an innovative model architecture with the efficiency of the MMPose foundation. The framework's architecture is meticulously designed to enhance performance and reduce latency, making it suitable for a variety of applications where real-time analysis is crucial. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +pip3 install onnx +pip3 install tqdm +pip3 install onnxsim +pip3 install mmdet==3.3.0 +pip3 install mmpose==1.3.1 +pip3 install mmdeploy==1.3.1 +pip3 install mmengine==0.10.4 +``` + +### Download + +Pretrained model: + +Dataset: to download the validation dataset. + +```bash +wget https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth +``` + +### Model Conversion + +```bash +# export onnx model +python3 export.py --weight rtmpose-m_simcc-aic-coco_pt-aic-coco_420e-256x192-63eb25f7_20230126.pth --cfg rtmpose-m_8xb256-420e_coco-256x192.py --output rtmpose.onnx + +# use onnxsim optimize onnx model +onnxsim rtmpose.onnx rtmpose_opt.onnx +``` + +## Inference + +```bash +export DATASETS_DIR=/Path/to/coco/ +``` + +### FP16 + +```bash +# Accuracy +bash scripts/infer_rtmpose_fp16_accuracy.sh +# Performance +bash scripts/infer_rtmpose_fp16_performance.sh +``` + +## Results + +| Model | BatchSize | Input Shape | Precision | FPS | mAP@0.5(%) | +| :-----: | :-------: | :---------: | :-------: | :------: | :--------: | +| RTMPose | 32 | 256x192 | FP16 | 3459.341 | 0.936 | diff --git a/models/cv/pose_estimation/rtmpose/igie/build_engine.py b/models/cv/pose_estimation/rtmpose/igie/build_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..d3626ae76cc9781d9a01ec3d3e2afbdbca409ff5 --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/build_engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
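+
+# Engine build helper for RTMPose on IGIE: parse --model_path/--input/--precision,
+# import the ONNX model through import_model_to_igie, compile it for the iluvatar MR
+# target with tvm.relay.build, and export the resulting engine (.so) to --engine_path.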
+ +import tvm +import argparse +from tvm import relay +from tvm.relay.import_model import import_model_to_igie + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--model_path", + type=str, + required=True, + help="original model path.") + + parser.add_argument("--engine_path", + type=str, + required=True, + help="igie export engine path.") + + parser.add_argument("--input", + type=str, + required=True, + help=""" + input info of the model, format should be: + input_name:input_shape + eg: --input input:1,3,224,224. + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision.") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + # get input valueinfo + input_name, input_shape = args.input.split(":") + shape = tuple([int(s) for s in input_shape.split(",")]) + input_dict = {input_name: shape} + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + mod, params = import_model_to_igie(args.model_path, input_dict, backend="igie") + + # build engine + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision) + + # export engine + lib.export_library(args.engine_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/models/cv/pose_estimation/rtmpose/igie/deploy_default.py b/models/cv/pose_estimation/rtmpose/igie/deploy_default.py new file mode 100644 index 0000000000000000000000000000000000000000..e0c613591e34f7502a09fb2f6e65a1229b315a5f --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/deploy_default.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +onnx_config = dict( + type='onnx', + export_params=True, + keep_initializers_as_inputs=False, + opset_version=11, + save_file='end2end.onnx', + input_names=['input'], + output_names=['output'], + input_shape=None, + optimize=True) + +codebase_config = dict(type='mmpose', task='PoseDetection') + +backend_config = dict(type='onnxruntime') diff --git a/models/cv/pose_estimation/rtmpose/igie/export.py b/models/cv/pose_estimation/rtmpose/igie/export.py new file mode 100644 index 0000000000000000000000000000000000000000..6fc4f453d92d5eb65dc745237efcc4a9187ba5ae --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/export.py @@ -0,0 +1,71 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +# License for the specific language governing permissions and limitations +# under the License. +import argparse + +import torch +from mmdeploy.utils import load_config +from mmdeploy.apis import build_task_processor + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--weight", + type=str, + required=True, + help="pytorch model weight.") + + parser.add_argument("--cfg", + type=str, + required=True, + help="model config file.") + + parser.add_argument("--output", + type=str, + required=True, + help="export onnx model path.") + + args = parser.parse_args() + return args + +def main(): + args = parse_args() + + deploy_cfg = 'deploy_default.py' + model_cfg = args.cfg + model_checkpoint = args.weight + + deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + + task_processor = build_task_processor(model_cfg, deploy_cfg, device='cpu') + + model = task_processor.build_pytorch_model(model_checkpoint) + + input_names = ['input'] + dynamic_axes = {'input': {0: '-1'}} + dummy_input = torch.randn(1, 3, 256, 192) + + torch.onnx.export( + model, + dummy_input, + args.output, + input_names = input_names, + dynamic_axes = dynamic_axes, + opset_version=13 + ) + + print("Export onnx model successfully! ") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/cv/pose_estimation/rtmpose/igie/inference.py b/models/cv/pose_estimation/rtmpose/igie/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..d01502e26ccdcf4555b35fba8865befe9a1b3453 --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/inference.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
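+
+# Overall flow: load the compiled IGIE engine, run the perf-only timing path if requested,
+# otherwise iterate over the mmpose CocoDataset dataloader, feed each (padded) batch to the
+# engine, decode the SimCC outputs with runner.model.head.decode, and report CocoMetric
+# results via runner.test_evaluator.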
+ +import os +import argparse +import tvm +import torch +import numpy as np +from tvm import relay +from tqdm import tqdm +from mmpose.registry import RUNNERS +from mmengine.config import Config + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument("--engine", + type=str, + required=True, + help="igie engine path.") + + parser.add_argument("--batchsize", + type=int, + required=True, + help="inference batch size.") + + parser.add_argument("--datasets", + type=str, + required=True, + help="datasets path.") + + parser.add_argument("--input_name", + type=str, + required=True, + help="input name of the model.") + + parser.add_argument("--warmup", + type=int, + default=3, + help="number of warmup before test.") + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=bool, + default=False, + help="Run performance test only") + + args = parser.parse_args() + + return args + +def main(): + args = parse_args() + + batch_size = args.batchsize + + # create iluvatar target & device + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + device = tvm.device(target.kind.name, 0) + + # load engine + lib = tvm.runtime.load_module(args.engine) + + # create runtime from engine + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + # just run perf test + if args.perf_only: + ftimer = module.module.time_evaluator("run", device, number=100, repeat=1) + prof_res = np.array(ftimer().results) * 1000 + fps = batch_size * 1000 / np.mean(prof_res) + print(f"\n* Mean inference time: {np.mean(prof_res):.3f} ms, Mean fps: {fps:.3f}") + else: + # warm up + for _ in range(args.warmup): + module.run() + + # runner config + cfg = Config.fromfile("rtmpose-m_8xb256-420e_coco-256x192.py") + + cfg.work_dir = "./" + cfg['test_dataloader']['batch_size'] = batch_size + cfg['test_dataloader']['dataset']['data_root'] = args.datasets + cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'images/val2017/' + cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/person_keypoints_val2017.json') + cfg['log_level'] = 'ERROR' + + # build runner + runner = RUNNERS.build(cfg) + + for data in tqdm(runner.test_dataloader): + outputs = [] + input_data = runner.model.data_preprocessor(data, False) + image = input_data['inputs'].cpu() + pad_batch = len(image) != batch_size + + if pad_batch: + origin_size = len(image) + image = np.resize(image, (batch_size, *image.shape[1:])) + + module.set_input("input", tvm.nd.array(image, device)) + + module.run() + + for i in range(module.get_num_outputs()): + output = module.get_output(i).asnumpy() + + if pad_batch: + output = output[:origin_size] + + output = torch.from_numpy(output) + outputs.append(output) + + preds = runner.model.head.decode((outputs[0], outputs[1])) + + if isinstance(preds, tuple): + batch_pred_instances, batch_pred_fields = preds + else: + batch_pred_instances = preds + batch_pred_fields = None + + batch_data_samples = runner.model.add_pred_to_datasample(batch_pred_instances, batch_pred_fields, input_data['data_samples']) + + runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=data) + + metrics = runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset)) + + +if __name__ == "__main__": + main() diff --git 
a/models/cv/pose_estimation/rtmpose/igie/rtmpose-m_8xb256-420e_coco-256x192.py b/models/cv/pose_estimation/rtmpose/igie/rtmpose-m_8xb256-420e_coco-256x192.py new file mode 100644 index 0000000000000000000000000000000000000000..9428b7fee0979ba91bc996a5f2c095d9662e946e --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/rtmpose-m_8xb256-420e_coco-256x192.py @@ -0,0 +1,465 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +auto_scale_lr = dict(base_batch_size=1024) +backend_args = dict(backend='local') +base_lr = 0.004 +codec = dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False) +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0002, + priority=49, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=390, + switch_pipeline=[ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(direction='horizontal', type='RandomFlip'), + dict(type='RandomHalfBody'), + dict( + rotate_factor=60, + scale_factor=[ + 0.75, + 1.25, + ], + shift_factor=0.0, + type='RandomBBoxTransform'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + transforms=[ + dict(p=0.1, type='Blur'), + dict(p=0.1, type='MedianBlur'), + dict( + max_height=0.4, + max_holes=1, + max_width=0.4, + min_height=0.2, + min_holes=1, + min_width=0.2, + p=0.5, + type='CoarseDropout'), + ], + type='Albumentation'), + dict( + encoder=dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False), + type='GenerateTarget'), + dict(type='PackPoseInputs'), + ], + type='mmdet.PipelineSwitchHook'), +] +data_mode = 'topdown' +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + badcase=dict( + _scope_='mmpose', + badcase_thr=5, + enable=False, + metric_type='loss', + out_dir='badcase', + type='BadCaseAnalysisHook'), + checkpoint=dict( + _scope_='mmpose', + interval=10, + max_keep_ckpts=1, + rule='greater', + save_best='coco/AP', + type='CheckpointHook'), + logger=dict(_scope_='mmpose', interval=50, type='LoggerHook'), + param_scheduler=dict(_scope_='mmpose', type='ParamSchedulerHook'), + sampler_seed=dict(_scope_='mmpose', type='DistSamplerSeedHook'), + timer=dict(_scope_='mmpose', type='IterTimerHook'), + visualization=dict( + _scope_='mmpose', enable=False, type='PoseVisualizationHook')) +default_scope = 'mmpose' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +input_size = ( + 192, + 256, +) +load_from = None +log_level = 'ERROR' +log_processor = dict( + _scope_='mmpose', + by_epoch=True, + num_digits=6, + type='LogProcessor', + window_size=50) +max_epochs = 420 +model = dict( + backbone=dict( + _scope_='mmdet', + 
act_cfg=dict(type='SiLU'), + arch='P5', + channel_attention=True, + deepen_factor=0.67, + expand_ratio=0.5, + init_cfg=dict( + checkpoint= + 'https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/cspnext-m_udp-aic-coco_210e-256x192-f2f7d6f6_20230130.pth', + prefix='backbone.', + type='Pretrained'), + norm_cfg=dict(type='SyncBN'), + out_indices=(4, ), + type='CSPNeXt', + widen_factor=0.75), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + std=[ + 58.395, + 57.12, + 57.375, + ], + type='PoseDataPreprocessor'), + head=dict( + decoder=dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False), + final_layer_kernel_size=7, + gau_cfg=dict( + act_fn='SiLU', + drop_path=0.0, + dropout_rate=0.0, + expansion_factor=2, + hidden_dims=256, + pos_enc=False, + s=128, + use_rel_bias=False), + in_channels=768, + in_featuremap_size=( + 6, + 8, + ), + input_size=( + 192, + 256, + ), + loss=dict( + beta=10.0, + label_softmax=True, + type='KLDiscretLoss', + use_target_weight=True), + out_channels=17, + simcc_split_ratio=2.0, + type='RTMCCHead'), + test_cfg=dict(flip_test=True), + type='TopdownPoseEstimator') +num_keypoints = 17 +optim_wrapper = dict( + clip_grad=dict(max_norm=35, norm_type=2), + optimizer=dict(lr=0.004, type='AdamW', weight_decay=0.05), + paramwise_cfg=dict( + bias_decay_mult=0, bypass_duplicate=True, norm_decay_mult=0), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=1000, start_factor=1e-05, + type='LinearLR'), + dict( + T_max=210, + begin=210, + by_epoch=True, + convert_to_iter_based=True, + end=420, + eta_min=0.0002, + type='CosineAnnealingLR'), +] +randomness = dict(seed=21) +resume = False +stage2_num_epochs = 30 +test_cfg = dict() +test_dataloader = dict( + batch_size=32, + dataset=dict( + ann_file='annotations/person_keypoints_val2017.json', + data_mode='topdown', + data_prefix=dict(img='images/val2017/'), + data_root='/root/.igie_cache/data/datasets/coco', + pipeline=[ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='PackPoseInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(round_up=False, shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file= + '/root/.igie_cache/data/datasets/coco/annotations/person_keypoints_val2017.json', + type='CocoMetric') +train_batch_size = 256 +train_cfg = dict(by_epoch=True, max_epochs=420, val_interval=10) +train_dataloader = dict( + batch_size=256, + dataset=dict( + ann_file='annotations/person_keypoints_train2017.json', + data_mode='topdown', + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(direction='horizontal', type='RandomFlip'), + dict(type='RandomHalfBody'), + dict( + rotate_factor=80, + scale_factor=[ + 0.6, + 1.4, + ], + type='RandomBBoxTransform'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + transforms=[ + dict(p=0.1, type='Blur'), + dict(p=0.1, type='MedianBlur'), + dict( + max_height=0.4, + max_holes=1, + max_width=0.4, + min_height=0.2, + min_holes=1, + min_width=0.2, + p=1.0, + type='CoarseDropout'), + ], + type='Albumentation'), + dict( + 
encoder=dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False), + type='GenerateTarget'), + dict(type='PackPoseInputs'), + ], + type='CocoDataset'), + num_workers=10, + persistent_workers=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(direction='horizontal', type='RandomFlip'), + dict(type='RandomHalfBody'), + dict( + rotate_factor=80, + scale_factor=[ + 0.6, + 1.4, + ], + type='RandomBBoxTransform'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + transforms=[ + dict(p=0.1, type='Blur'), + dict(p=0.1, type='MedianBlur'), + dict( + max_height=0.4, + max_holes=1, + max_width=0.4, + min_height=0.2, + min_holes=1, + min_width=0.2, + p=1.0, + type='CoarseDropout'), + ], + type='Albumentation'), + dict( + encoder=dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False), + type='GenerateTarget'), + dict(type='PackPoseInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(direction='horizontal', type='RandomFlip'), + dict(type='RandomHalfBody'), + dict( + rotate_factor=60, + scale_factor=[ + 0.75, + 1.25, + ], + shift_factor=0.0, + type='RandomBBoxTransform'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='mmdet.YOLOXHSVRandomAug'), + dict( + transforms=[ + dict(p=0.1, type='Blur'), + dict(p=0.1, type='MedianBlur'), + dict( + max_height=0.4, + max_holes=1, + max_width=0.4, + min_height=0.2, + min_holes=1, + min_width=0.2, + p=0.5, + type='CoarseDropout'), + ], + type='Albumentation'), + dict( + encoder=dict( + input_size=( + 192, + 256, + ), + normalize=False, + sigma=( + 4.9, + 5.66, + ), + simcc_split_ratio=2.0, + type='SimCCLabel', + use_dark=False), + type='GenerateTarget'), + dict(type='PackPoseInputs'), +] +val_batch_size = 64 +val_cfg = dict() +val_dataloader = dict( + batch_size=64, + dataset=dict( + ann_file='annotations/person_keypoints_val2017.json', + data_mode='topdown', + data_prefix=dict(img='images/val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='PackPoseInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(round_up=False, shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/person_keypoints_val2017.json', + type='CocoMetric') +val_pipeline = [ + dict(backend_args=dict(backend='local'), type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(input_size=( + 192, + 256, + ), type='TopdownAffine'), + dict(type='PackPoseInputs'), +] +vis_backends = [ + dict(_scope_='mmpose', type='LocalVisBackend'), +] +visualizer = dict( + _scope_='mmpose', + name='visualizer', + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' diff --git a/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_accuracy.sh b/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_accuracy.sh new file mode 100644 index 
0000000000000000000000000000000000000000..9e0c7da0edadacb3b36fd0394d8e923b375d2eeb --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_accuracy.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="rtmpose_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,256,192 \ + --precision fp16 \ + --engine_path rtmpose_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine rtmpose_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ No newline at end of file diff --git a/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_performance.sh b/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..568cb1bc6a317c7915f5cb15a491c083c76859e6 --- /dev/null +++ b/models/cv/pose_estimation/rtmpose/igie/scripts/infer_rtmpose_fp16_performance.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +batchsize=32 +model_path="rtmpose_opt.onnx" +datasets_path=${DATASETS_DIR} + +# build engine +python3 build_engine.py \ + --model_path ${model_path} \ + --input input:${batchsize},3,256,192 \ + --precision fp16 \ + --engine_path rtmpose_bs_${batchsize}_fp16.so + + +# inference +python3 inference.py \ + --engine rtmpose_bs_${batchsize}_fp16.so \ + --batchsize ${batchsize} \ + --input_name input \ + --datasets ${datasets_path} \ + --perf_only True \ No newline at end of file diff --git a/models/cv/segmentation/mask_rcnn/ixrt/README.md b/models/cv/segmentation/mask_rcnn/ixrt/README.md index 8b2bd2b7689b50dfdad9f526d80b880eb19eed70..ab86e0886d58ed6a8bd77e50eca92c68f3acafae 100644 --- a/models/cv/segmentation/mask_rcnn/ixrt/README.md +++ b/models/cv/segmentation/mask_rcnn/ixrt/README.md @@ -4,45 +4,60 @@ Mask R-CNN (Mask Region-Based Convolutional Neural Network) is an extension of the Faster R-CNN model, which is itself an improvement over R-CNN and Fast R-CNN. 
Developed by Kaiming He et al., Mask R-CNN is designed for object instance segmentation tasks, meaning it not only detects objects within an image but also generates high-quality segmentation masks for each instance. -## Setup +## Prepare + +```bash +# go to current model home path +cd ${PROJ_ROOT}/models/cv/segmentation/mask_rcnn/ixrt +``` + +Prepare weights and datasets referring to below steps: + +"maskrcnn.wts" [export method](https://github.com/wang-xinyu/tensorrtx/tree/master/rcnn#how-to-run) -Prepare on MR GPU +- use the [script](https://github.com/wang-xinyu/tensorrtx/blob/master/rcnn/gen_wts.py) +- use the [config file](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml) +- use [weights](https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl) ```bash -cd PATH/TO/scripts -bash init.sh +# put maskrcnn.wts in "python/maskrcnn.wts" +wget -P python/ http://files.deepspark.org.cn:880/deepspark/wts/maskrcnn.wts ``` -Prepare on NV GPU +Visit [COCO site](https://cocodataset.org/) and get COCO2017 datasets + +- images directory: coco/images/val2017/*.jpg +- annotations directory: coco/annotations/instances_val2017.json + +## Setup ```bash -cd PATH/TO/scripts -bash init_nv.sh +cd scripts/ ``` -### Model Conversion +### Prepare on MR GPU -Prepare weights and datasets referring to below steps: +```bash +bash init.sh +``` + +### Prepare on NV GPU -- "maskrcnn.wts" [export method](https://github.com/wang-xinyu/tensorrtx/tree/master/rcnn#how-to-run), use the [script](https://github.com/wang-xinyu/tensorrtx/blob/master/rcnn/gen_wts.py), use the [config file](https://github.com/facebookresearch/detectron2/blob/main/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml), use [weights](https://dl.fbaipublicfiles.com/detectron2/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x/137259246/model_final_9243eb.pkl) - - put maskrcnn.wts in "python/maskrcnn.wts" -- Visit [COCO site](https://cocodataset.org/) and get COCO2017 datasets - - images directory: coco/images/val2017/*.jpg - - annotations directory: coco/annotations/instances_val2017.json +```bash +bash init_nv.sh +``` ## Inference ### FP16 Performance ```bash -cd PATH/TO/scripts bash infer_maskrcnn_fp16_performance.sh ``` ### FP16 Accuracy ```bash -cd PATH/TO/scripts bash infer_maskrcnn_fp16_acc.sh ``` diff --git a/models/cv/segmentation/mask_rcnn/ixrt/python/maskrcnn.py b/models/cv/segmentation/mask_rcnn/ixrt/python/maskrcnn.py index ce68fec2c54fed9c26a4fdeeae6e4158e9dce71d..75484195b0424a3ef852e620fb1dea2390048700 100644 --- a/models/cv/segmentation/mask_rcnn/ixrt/python/maskrcnn.py +++ b/models/cv/segmentation/mask_rcnn/ixrt/python/maskrcnn.py @@ -206,12 +206,12 @@ def get_maskrcnn_perf(config): # Process I/O and execute the network cuda.memcpy_htod(inputs[0]["allocation"], data_batch) - # warm up + # warm up print("Warmup start ...") for i in range(5): context.execute_v2(allocations) print("Warmup done !\nStart forward ...") - # run + # run forward_time = 0 for i in range(20): start_time = time.time() @@ -268,13 +268,14 @@ def get_maskrcnn_acc(config): # Prepare the output data output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - # warm up + # warm up print("Warmup start ...") for i in range(3): context.execute_v2(allocations) print("Warmup done !\nStart forward ...") - + # run + start_time = time.time() for batch_data, batch_img_shape, batch_img_id, batched_paddings, paths in tqdm(dataloader): 
batch_data = batch_data.numpy() batch_img_shape = batch_img_shape.numpy() @@ -286,7 +287,7 @@ def get_maskrcnn_acc(config): # cpu -> gpu batch_data = np.ascontiguousarray(batch_data) cuda.memcpy_htod(inputs[0]["allocation"], batch_data) - + context.execute_v2(allocations) # gpu -> cpu @@ -312,19 +313,22 @@ def get_maskrcnn_acc(config): batched_paddings[0] ) save2json(batch_img_id, bboxs_masks, json_result, class_map) - + end_time = time.time() + end2end_time = end_time - start_time + + print(F"E2E time : {end2end_time:.3f} seconds") print("Forward done !") - + tmp_result_name = "pred_results.json" if os.path.exists(tmp_result_name): os.remove(tmp_result_name) with open(tmp_result_name, "w") as f: json.dump(json_result, f) - + anno = COCO(anno_json) # init annotations api pred = anno.loadRes(tmp_result_name) # init predictions api - + eval = COCOeval(anno, pred, "bbox") eval.evaluate() eval.accumulate() @@ -340,11 +344,11 @@ def get_maskrcnn_acc(config): _, map50 = eval.stats[:2] print("bbox mAP@0.5 : ", map50) print(f"bbox Accuracy Check : Test {map50} >= target {config.map_target}") - + _, segm_map50 = segm_eval.stats[:2] print("segm mAP@0.5 : ", segm_map50) print(f"segm Accuracy Check : Test {segm_map50} >= target {config.segm_map_target}") - + if map50 >= config.map_target and segm_map50 >= config.segm_map_target: print("pass!") else: diff --git a/models/cv/segmentation/mask_rcnn/ixrt/scripts/init.sh b/models/cv/segmentation/mask_rcnn/ixrt/scripts/init.sh index 7b699195a5bdf40b74f2d610a51bb5b17e5415ef..bd1fe1177a797dc149968f82017ee25877667c00 100644 --- a/models/cv/segmentation/mask_rcnn/ixrt/scripts/init.sh +++ b/models/cv/segmentation/mask_rcnn/ixrt/scripts/init.sh @@ -14,13 +14,14 @@ # under the License. # build plugin -cd ../plugins +pushd ../plugins rm -rf build mkdir build cd build -cmake .. -DIXRT_HOME=/usr/local/corex -# cmake .. -DIXRT_HOME=/opt/sw_home/local +cmake .. -D CMAKE_C_COMPILER="/usr/local/corex/bin/clang" -D CMAKE_CXX_COMPILER="/usr/local/corex/bin/clang++" -DIXRT_HOME=/usr/local/corex + make -j8 +popd ## install packages @@ -28,10 +29,10 @@ bash prepare_system_env.sh # pip whl #pip3 install opencv-python==4.6.0.66 -pip3 install pycocotools==2.0.2 +pip3 install pycocotools==2.0.7 pip3 install tqdm # build engine -cd ../../python +cd ../python rm -rf ./maskrcnn.engine python3 maskrcnn.py build_engine --wts_file ./maskrcnn.wts --engine_file ./maskrcnn.engine diff --git a/models/nlp/language_model/albert/ixrt/README.md b/models/nlp/language_model/albert/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..29e41623354e16bca9be8afb67ff78a2071ae8ff --- /dev/null +++ b/models/nlp/language_model/albert/ixrt/README.md @@ -0,0 +1,92 @@ +# AlBERT + +## Description + +Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representations from Transformers) model that focuses on efficiency and scalability while maintaining strong performance in natural language processing tasks. The AlBERT model introduces parameter reduction techniques and incorporates self-training strategies to enhance its effectiveness. + +## Setup + +### Install + +```bash +pip3 install onnxsim +pip3 install onnx_graphsurgeon +pip3 install scikit-learn +pip3 install tqdm +pip3 install pycuda +pip3 install onnx +pip3 install tabulate +pip3 install cv2 +pip3 install pycocotools +pip3 install opencv-python==4.6.0.66 +``` + +### Download + +Pretrained model: + +Dataset: to download the squad dataset. 
+
+Or you can:
+
+```bash
+bash /scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+Please correct the paths in the following commands or files.
+
+```bash
+tar -xvf open_albert.tar
+wget
+python3 torch2onnx.py --model_path albert-base-squad.pt --output_path albert-torch-fp32.onnx
+onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=/Path/albert-base-squad
+export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_albert_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document: , and note that you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+pip3 install -r toolbox/ByteMLPerf/blob/iluvatar_general_infer/byte_infer_perf/general_perf/requirements.txt
+mv /ixrt/perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn  # password: 123..com
+get /upload/3-app/byteperf/madlag.tar
+exit
+tar -zxvf madlag.tar
+
+# Then modify toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py:
+# AutoTokenizer.from_pretrained("madlag/albert-base-v2-squad") => AutoTokenizer.from_pretrained("/Your/Path/madlag/albert-base-v2-squad")
+
+cd toolbox/ByteMLPerf/byte_infer_perf/
+mv /general_perf/general_perf/model_zoo/popular/open_albert /general_perf/model_zoo/popular/open_albert
+cd general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+```
+
+## Results
+
+| Model  | BatchSize | Precision | QPS   | Exact Match | F1 Score |
+| ------ | --------- | --------- | ----- | ----------- | -------- |
+| AlBERT | 16        | FP16      | 50.99 | 80.18       | 87.57    |
\ No newline at end of file
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/README.md b/models/nlp/language_model/bert_base_squad/ixrt/README.md index acc3592b59533fcfd6334628977c78b05916de02..6629b384e8a90d7cdc83598649ee59d75c3a501a 100644 --- a/models/nlp/language_model/bert_base_squad/ixrt/README.md +++ b/models/nlp/language_model/bert_base_squad/ixrt/README.md @@ -14,6 +14,11 @@ docker pull nvcr.io/nvidia/tensorrt:23.04-py3 ## Install +```bash +pip install onnx +pip install pycuda +``` + ### Install on Iluvatar ```bash @@ -37,39 +42,31 @@ bash script/prepare.sh v1_1 ## Inference -```bash -# INT8 -cd python -pip install onnx pycuda -bash script/build_engine.sh --bs 32 --int8 -bash script/inference_squad.sh --bs 32 --int8 -``` - ### On Iluvatar #### FP16 ```bash -cd python/script -bash infer_bert_base_squad_fp16_ixrt.sh -``` +cd script -#### INT8 + +# FP16 +bash infer_bert_base_squad_fp16_ixrt.sh -```bash -cd python/script + +# INT8 bash infer_bert_base_squad_int8_ixrt.sh ``` -### On T4 +### On NV ```bash # FP16 -cd python -pip install onnx pycuda # use --bs to set max_batch_size (dynamic) bash script/build_engine.sh --bs 32 bash script/inference_squad.sh --bs 32 + +# INT8 +bash script/build_engine.sh --bs 32 --int8 +bash script/inference_squad.sh --bs 32 --int8 ``` ## Results diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/inference.py b/models/nlp/language_model/bert_base_squad/ixrt/python/inference.py index 
8685604f7e259263be098378f52d6ff90514a6ac..a509f0713867980168192ae9ed884821072c4ecf 100644 --- a/models/nlp/language_model/bert_base_squad/ixrt/python/inference.py +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/inference.py @@ -153,10 +153,10 @@ if __name__ == '__main__': break if selected_profile == -1: raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size)) - + # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() - + # if args.use_trt: # context.active_optimization_profile = selected_profile # else: @@ -212,7 +212,7 @@ if __name__ == '__main__': # Only retrieve and post-process the first batch batch = h_output[0] - + networkOutputs.append(_NetworkOutput( start_logits = np.array(batch.squeeze()[:, 0]), end_logits = np.array(batch.squeeze()[:, 1]), @@ -247,8 +247,8 @@ if __name__ == '__main__': _NetworkOutput = collections.namedtuple( # pylint: disable=invalid-name "NetworkOutput", ["start_logits", "end_logits", "feature_index"]) - networkOutputs = [] - + networkOutputs = [] + batch_input_ids = [] batch_segment_ids = [] all_token_ids = [] @@ -323,7 +323,7 @@ if __name__ == '__main__': cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream) context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) stream.synchronize() - + start_time = time.time() output_index = 0 for input_ids, segment_ids in tqdm(all_token_ids): @@ -337,10 +337,10 @@ if __name__ == '__main__': context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) stream.synchronize() - + cuda.memcpy_dtoh_async(h_output, d_output, stream) stream.synchronize() - + new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2) for index in range(input_ids.shape[0]): networkOutputs.append(_NetworkOutput( @@ -372,8 +372,8 @@ if __name__ == '__main__': lengths.append(len(features[0].input_ids)) sort_index = np.argsort(lengths) - infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) - + infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) + print(F"E2E time : {infer_time:.3f} seconds") qps = len(squad_examples)/infer_time print(f"Latency QPS: {qps} sentences/s") diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py index 92c4e83bf7f150156108b7ccd99f0a9373222c2a..67d6c18245b7eec0c8a995fc2a7284715429b498 100644 --- a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py @@ -26,6 +26,7 @@ import re import argparse import json import sys +import time def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py index a85e765c91152562d6180307c2bb1317dc385356..b6af06dcf683496128bbbdb9e5458f8b2753885d 100644 --- a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py +++ 
b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py @@ -379,7 +379,8 @@ if __name__ == '__main__': lengths.append(len(features[0].input_ids)) sort_index = np.argsort(lengths) - infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) + infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) + print(F"E2E time : {infer_time:.3f} seconds") qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time print(f"Latency QPS: {qps} sentences/s") diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/language_model/bert_base_squad/ixrt/python/script/mdb_infer_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..6386fe83a92b61e23ab0e33c8f88b0d4440846e1 --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/script/mdb_infer_run.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +index=0 +options=("$@") # 将所有参数存储到数组中 +PRECISION=fp16 +BSZ=32 + +# 循环遍历所有参数 +while [[ $index -lt ${#options[@]} ]]; do + argument=${options[$index]} + case $argument in + --bs) + ((index++)) + BSZ=${options[$index]} + ;; + --prec) + ((index++)) + PRECISION=${options[$index]} + ;; + esac + ((index++)) +done + +# 设置INT8_FLAG +INT8_FLAG="" +if [[ "$PRECISION" == "int8" ]]; then + INT8_FLAG="--int8" +fi + +# 设置BSZ_FLAG +BSZ_FLAG="" +if [[ "$BSZ" -ne 32 ]]; then + BSZ_FLAG="--bs $BSZ" +fi + +echo "PREC_FLAG=$INT8_FLAG" +echo "PRECISION=$PRECISION" +echo "BSZ=$BSZ" +echo "BSZ_FLAG=$BSZ_FLAG" + +# 检查环境并执行相应的脚本 +if command -v ixsmi &>/dev/null; then + echo "MR env" + cmake -S . -B build + cmake --build build -j16 + cd ./python/script/ + bash infer_bert_base_squad_${PRECISION}_ixrt.sh $BSZ_FLAG + +elif command -v nvidia-smi &>/dev/null; then + echo "NV env" + cmake -S . -B build -DUSE_TENSORRT=true + cmake --build build -j16 + cd ./python/ + bash script/build_engine.sh --bs $BSZ $INT8_FLAG + bash script/inference_squad.sh --bs $BSZ $INT8_FLAG +else + echo "No driver detected" + exit 1 +fi diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/script/prepare.sh b/models/nlp/language_model/bert_base_squad/ixrt/python/script/prepare.sh index 843166dec9d30224e818649f61868e9b968a2f37..5bd750c46738ac3489fadd81c730fff0eabfe5eb 100644 --- a/models/nlp/language_model/bert_base_squad/ixrt/python/script/prepare.sh +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/script/prepare.sh @@ -51,7 +51,7 @@ else echo 'squad directory existed' fi -echo "Step 2: Downloading model file and config to ./data/bert-large-uncased" +echo "Step 2: Downloading model file and config to ./data/bert_base_uncased_squad" if [ ! 
-d "./bert_base_uncased_squad" ]; then wget https://drive.google.com/file/d/1_q7SaiZjwysJ3jWAIQT2Ne-duFdgWivR/view?usp=drive_link diff --git a/models/nlp/language_model/bert_large_squad/ixrt/README.md b/models/nlp/language_model/bert_large_squad/ixrt/README.md index fcd17cd1f84b8fba7fa2c644a00a627253d7286e..47aeb0f37add903d6e1b9453e4f3d73faab025e8 100644 --- a/models/nlp/language_model/bert_large_squad/ixrt/README.md +++ b/models/nlp/language_model/bert_large_squad/ixrt/README.md @@ -6,29 +6,34 @@ BERT is designed to pre-train deep bidirectional representations from unlabeled ## Setup -### T4 requirement(tensorrt_version >= 8.6) +### NV requirement(tensorrt_version >= 8.6) ```bash docker pull nvcr.io/nvidia/tensorrt:23.04-py3 ``` -### Install +## Install -#### On iluvatar +```bash +pip install onnx +pip install pycuda +``` + +### On Iluvatar ```bash cmake -S . -B build cmake --build build -j16 ``` -#### On T4 +### On NV ```bash cmake -S . -B build -DUSE_TENSORRT=true cmake --build build -j16 ``` -### Download +## Download ```bash cd python @@ -41,10 +46,10 @@ bash script/prepare.sh v1_1 ```bash cd python -pip install onnx pycuda + # use --bs to set max_batch_size (dynamic) -bash script/build_engine --bs 32 -bash script/inference_squad.sh --bs {batch_size} +bash script/build_engine.sh --bs 32 +bash script/inference_squad.sh --bs 32 ``` ### INT8 @@ -52,16 +57,16 @@ bash script/inference_squad.sh --bs {batch_size} ```bash cd python pip install onnx pycuda -bash script/build_engine --bs 32 --int8 -bash script/inference_squad.sh --bs {batch_size} --int8 +bash script/build_engine.sh --bs 32 --int8 +bash script/inference_squad.sh --bs 32 --int8 ``` ## Results -Model | BatchSize | Precision | FPS | ACC -------|-----------|-----------|-----|---- -BERT-Large-SQuAD | 32 | FP16 | Latency QPS: 470.26 sentences/s | "exact_match": 82.36, "f1": 89.68 -BERT-Large-SQuAD | 32 | INT8 | Latency QPS: 1490.47 sentences/s | "exact_match": 80.92, "f1": 88.20 +| Model | BatchSize | Precision | Latency QPS | exact_match | f1 | +| ---------------- | --------- | --------- | ------------------- | ----------- | ----- | +| BERT-Large-SQuAD | 32 | FP16 | 470.26 sentences/s | 82.36 | 89.68 | +| BERT-Large-SQuAD | 32 | INT8 | 1490.47 sentences/s | 80.92 | 88.20 | ## Referenece diff --git a/models/nlp/language_model/bert_large_squad/ixrt/python/inference.py b/models/nlp/language_model/bert_large_squad/ixrt/python/inference.py index 8685604f7e259263be098378f52d6ff90514a6ac..860322c3ed5873e0002d9aa24a011394dd92e570 100644 --- a/models/nlp/language_model/bert_large_squad/ixrt/python/inference.py +++ b/models/nlp/language_model/bert_large_squad/ixrt/python/inference.py @@ -372,7 +372,8 @@ if __name__ == '__main__': lengths.append(len(features[0].input_ids)) sort_index = np.argsort(lengths) - infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) + infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) + print(F"E2E time : {infer_time:.3f} seconds") qps = len(squad_examples)/infer_time print(f"Latency QPS: {qps} sentences/s") diff --git a/models/nlp/language_model/bert_large_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/language_model/bert_large_squad/ixrt/python/script/mdb_infer_run.sh new file mode 100644 index 0000000000000000000000000000000000000000..f19c1def4b139edc1e02b7ae595327dc367c919e --- /dev/null +++ b/models/nlp/language_model/bert_large_squad/ixrt/python/script/mdb_infer_run.sh @@ -0,0 +1,63 @@ 
+#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +index=0 +options=("$@") # 将所有参数存储到数组中 +PRECISION=fp16 +BSZ=32 + +# 循环遍历所有参数 +while [[ $index -lt ${#options[@]} ]]; do + argument=${options[$index]} + case $argument in + --bs) + ((index++)) + BSZ=${options[$index]} + ;; + --prec) + ((index++)) + PRECISION=${options[$index]} + ;; + esac + ((index++)) +done + +# 设置INT8_FLAG +INT8_FLAG="" +if [[ "$PRECISION" == "int8" ]]; then + INT8_FLAG="--int8" +fi + +echo "PREC_FLAG=$INT8_FLAG" +echo "PRECISION=$PRECISION" +echo "BSZ=$BSZ" + +# 检查环境并执行相应的脚本 +if command -v ixsmi &>/dev/null; then + echo "MR env" + cmake -S . -B build + cmake --build build -j16 +elif command -v nvidia-smi &>/dev/null; then + echo "NV env" + cmake -S . -B build -DUSE_TENSORRT=true + cmake --build build -j16 +else + echo "No driver detected" + exit 1 +fi +cd ./python/ +bash script/build_engine.sh --bs $BSZ $INT8_FLAG +bash script/inference_squad.sh --bs $BSZ $INT8_FLAG diff --git a/models/nlp/language_model/deberta/ixrt/README.md b/models/nlp/language_model/deberta/ixrt/README.md index 56c3607e114fc5300eec0cd7e46c1a660ae7d430..4c0ecf5b7e5fb87aba8891efc1650722a4c59608 100644 --- a/models/nlp/language_model/deberta/ixrt/README.md +++ b/models/nlp/language_model/deberta/ixrt/README.md @@ -1,4 +1,4 @@ -# DeBERTa +# DeBerta ## Description @@ -9,8 +9,6 @@ DeBERTa (Decoding-enhanced BERT with disentangled attention) is an enhanced vers ### Install ```bash -apt install -y libnuma-dev - pip3 install onnxsim pip3 install onnx_graphsurgeon pip3 install scikit-learn @@ -21,25 +19,26 @@ pip3 install tabulate pip3 install cv2 pip3 install pycocotools pip3 install opencv-python==4.6.0.66 -pip3 install tf2onnx -pip3 install transformers==4.33.3 ``` ### Download -Pretrained model: < > +Pretrained model: -Dataset: < > to download the squad dataset. +Dataset: to download the squad dataset. +or you can : ```bash -bash scripts/prepare_model_and_dataset.sh +bash /scripts/prepare_model_and_dataset.sh + ``` ### Model Conversion - +Please correct the paths in the following commands or files. 
```bash -wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/deberta-torch-fp32.json -python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx +tar -xvf open_deberta.tar +wget +python3 torch2onnx.py --model_path deberta-base-squad.pt --output_path deberta-torch-fp32.onnx onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx python3 remove_clip_and_cast.py @@ -47,8 +46,9 @@ python3 remove_clip_and_cast.py ## Inference + ```bash -export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast +export ORIGIN_ONNX_NAME=/Path/deberta-sim-drop-clip-drop-invaild-cast export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py export PROJ_PATH=./ ``` @@ -62,36 +62,33 @@ bash scripts/infer_deberta_fp16_performance.sh ### Accuracy -If you want to evaluate the accuracy of this model, please visit the website: < >, which integrates inference and training of many models under this framework, supporting the ILUVATAR backend +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend -For detailed steps regarding this model, please refer to this document: < > Note: You need to modify the relevant paths in the code to your own correct paths. -```bash -# clone and install requirements -git clone https://github.com/yudefu/ByteMLPerf.git -b iluvatar_general_infer -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt - -# setup -mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py -cp ./datasets/open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/ +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. 
-mv ./deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/ -mv ./general_perf/model_zoo/popular/ ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/ +```bash -cd /ByteMLPerf/byte_infer_perf/general_perf -wget http://files.deepspark.org.cn:880/deepspark/Palak.tar +pip3 install -r toolbox/ByteMLPerf/blob/iluvatar_general_infer/byte_infer_perf/general_perf/requirements.txt +mv /ixrt/perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn (password: 123..com) +get /upload/3-app/byteperf/Palak.tar +exit tar -zxvf Palak.tar -# Then modify the code: ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py -AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/Your/Path/Palak/microsoft_deberta-base_squad") +# Then modify the code: toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py +AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/Your/Path/Palak/microsoft_deberta-base_squad") -# run acc perf -sed -i 's/tensorrt_legacy/tensorrt/g' backends/ILUVATAR/common.py +mv deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/ +cd toolbox/ByteMLPerf/byte_infer_perf/ +mv /general_perf/general_perf/model_zoo/popular/open_deberta /general_perf/model_zoo/popular/open_deberta +cd toolbox/ByteMLPerf/byte_infer_perf/general_perf python3 core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32 ``` + ## Results -| Model | BatchSize | Precision | QPS | Exact Match | F1 Score | -| ------- | --------- | --------- | ----- | ----------- | -------- | -| DeBERTa | 1 | FP16 | 18.58 | 73.76 | 81.24 | +Model |BatchSize |Precision |QPS |Exact Match |F1 Score +--------|-----------|----------|----------|-------------|------------ +DeBerta | 16 | FP16 | 18.58 | 73.76 | 81.24
diff --git a/models/nlp/language_model/roberta/ixrt/README.md b/models/nlp/language_model/roberta/ixrt/README.md index 5ba6e8880b088b23bc9d8b4205291309951f0624..d781bfad9f63c3421a8288545fe530bfa9526831 100644 --- a/models/nlp/language_model/roberta/ixrt/README.md +++ b/models/nlp/language_model/roberta/ixrt/README.md @@ -58,34 +58,34 @@ bash scripts/infer_roberta_fp16_performance.sh ### Accuracy -If you want to evaluate the accuracy of this model, please visit the website: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend -For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. 
```bash # Install requirements -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt -mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +pip3 install -r toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt +pip3 install -r toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt +mv perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py # Move open_roberta -mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ -mv open_roberta ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ +mkdir -p toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ +mv open_roberta toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ # Get open_squad wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar tar xf open_squad.tar -cp ./open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad +cp ./open_squad/* toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad rm -f open_squad.tar # Get csarron.tar wget http://files.deepspark.org.cn:880/deepspark/csarron.tar tar xf csarron.tar rm -f csarron.tar -mv csarron/ ./ByteMLPerf/byte_infer_perf/ +mv csarron/ toolbox/ByteMLPerf/byte_infer_perf/ # Run Acc scripts -cd ./ByteMLPerf/byte_infer_perf/ +cd toolbox/ByteMLPerf/byte_infer_perf/ python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32 ``` diff --git a/models/nlp/language_model/roformer/ixrt/README.md b/models/nlp/language_model/roformer/ixrt/README.md index ba1e5975e0d0f6d094f3055aaf708c65dfed9b07..8947a617e22298a4195d22f32884911426fdc032 100644 --- a/models/nlp/language_model/roformer/ixrt/README.md +++ b/models/nlp/language_model/roformer/ixrt/README.md @@ -64,31 +64,29 @@ bash scripts/infer_roformer_fp16_performance.sh ### Accuracy -If you want to evaluate the accuracy of this model, please visit the website: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend. +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend. -For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. 
```bash -# Clone ByteMLPerf -git clone -b iluvatar_general_infer https://github.com/yudefu/ByteMLPerf.git -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt -mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py -mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ +pip3 install -r toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt +mv perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +mkdir -p toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ # Comment Line102 in compile_backend_iluvatar.py -sed -i '102s/build_engine/# build_engine/' ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py +sed -i '102s/build_engine/# build_engine/' toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py # Move open_roformer -mv ./data/open_roformer ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ +mv ./data/open_roformer toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/ # Setup open_cail2019 dataset wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_cail2019.tar tar xf open_cail2019.tar -cp ./open_cail2019/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019 +cp ./open_cail2019/* toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019 rm -f open_cail2019.tar # Go to general_perf/ -cd ./ByteMLPerf/byte_infer_perf/general_perf +cd toolbox/ByteMLPerf/byte_infer_perf/general_perf # Modify model_zoo/roformer-tf-fp32.json sed -i 's/segment:0/segment0/g; s/token:0/token0/g' model_zoo/roformer-tf-fp32.json # Run Acc scripts diff --git a/models/nlp/language_model/videobert/ixrt/README.md b/models/nlp/language_model/videobert/ixrt/README.md new file mode 100644 index 0000000000000000000000000000000000000000..18e2362d4f519492f73baea1509364720108bf8a --- /dev/null +++ b/models/nlp/language_model/videobert/ixrt/README.md @@ -0,0 +1,76 @@ +# VideoBERT + +## Description + +VideoBERT is a model designed for video understanding tasks, extending the capabilities of BERT (Bidirectional Encoder Representations from Transformers) to video data. It enhances video representation learning by integrating both visual and textual information into a unified framework. + +## Setup + +### Install + +```bash +pip3 install onnxsim +pip3 install onnx_graphsurgeon +pip3 install scikit-learn +pip3 install tqdm +pip3 install pycuda +pip3 install onnx +pip3 install tabulate +pip3 install cv2 +pip3 install pycocotools +pip3 install opencv-python==4.6.0.66 +``` + +### Download + +Pretrained model: + +Dataset: to download the cifar-100-python dataset. + +or you can : +```bash +bash /scripts/prepare_model_and_dataset.sh + +``` + +## Inference + + +```bash +export ORIGIN_ONNX_NAME=/Path/video-bert +export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py +export PROJ_PATH=./ + +``` + +### Performance + +```bash + +bash scripts/infer_videobert_fp16_performance.sh +``` + +### Accuracy + +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend + + +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. 
+ +```bash + +pip3 install -r toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt +mv /ixrt/perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +cd /toolbox/ByteMLPerf/byte_infer_perf/ +mv /general_perf/general_perf/model_zoo/popular/open_videobert /general_perf/model_zoo/popular/open_videobert +cd /toolbox/ByteMLPerf/byte_infer_perf/general_perf +python3 core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32 +``` +Modify the variable in the function of the file to the actual video-bert.onnx path. + + +## Results + +Model |BatchSize |Precision |QPS |Top-1 ACC | +---------|-----------|----------|----------|-------------| +VideoBERT| 16 | FP16 | 37.68 | 61.67 | \ No newline at end of file diff --git a/models/nlp/language_model/videobert/ixrt/perf_engine.py b/models/nlp/language_model/videobert/ixrt/perf_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8 --- /dev/null +++ b/models/nlp/language_model/videobert/ixrt/perf_engine.py @@ -0,0 +1,349 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import os +import logging +import importlib +import json +import subprocess +import time + +from typing import Any, Dict, Tuple +from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog +from prompt_toolkit.styles import Style + +BYTE_MLPERF_ROOT = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +os.chdir(BYTE_MLPERF_ROOT) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +import argparse +from general_perf.core.configs.workload_store import load_workload +from general_perf.core.configs.dataset_store import load_dataset +from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("PerfEngine") +os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3' + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="resnet50-tf-fp32", + help="The task going to be evaluted, refs to workloads/") + parser.add_argument( + "--hardware_type", + default="GPU", + help="The backend going to be evaluted, refs to backends/") + parser.add_argument("--compile_only", + action='store_true', + help="Run compilation only") + + args = parser.parse_args() + return args + + +class PerfEngine: + def __init__(self) -> None: + super().__init__() + self.args = get_args() + self.workload = load_workload(self.args.task) + self.backend_type = self.args.hardware_type + self.compile_backend = None + self.old_os_path = os.environ['PATH'] + self.prev_sys_path = list(sys.path) + self.real_prefix = sys.prefix + self.compile_only_mode = False + + def start_engine(self) -> None: + ''' + Byte MlPerf will create an virtual env for each backend to avoid dependance conflict + ''' + success, total = 0, len(self.workload) + if total == 0: + return + 
log.info("******************* Backend Env Initization *******************") + status = self.activate_venv(self.backend_type) + if not status: + log.warning("Activate virtualenv Failed, Please Check...") + + self.compile_backend = init_compile_backend(self.backend_type) + self.runtime_backend = init_runtime_backend(self.backend_type) + + output_dir = os.path.abspath('general_perf/reports/' + + self.backend_type) + os.makedirs(output_dir, exist_ok=True) + + status = self.single_workload_perf(self.workload) + + def single_workload_perf( + self, workload: Dict[str, Any]) -> bool: + log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model'])) + + # Check Compile Only Mode + self.compile_only_mode = False + if self.args.compile_only or workload['compile_only']: + self.compile_only_mode = True + + base_report = { + "Model": workload['model'].upper(), + "Backend": self.backend_type, + "Host Info": self.get_cpu_name() + } + + # Initalize Model Config Info + model_info = self.get_model_info(workload['model']) + pre_compile_config = {"workload": workload, 'model_info': model_info} + interact_info = self.check_interact_info(pre_compile_config) + pre_compile_config['interact_info'] = interact_info + if not model_info['dataset_name']: + model_info['dataset_name'] = 'fake_dataset' + + + ''' + Compile Backend could do some optimization like convert model format here + ''' + log.info("******************************************* Running Backend Compilation... *******************************************") + log.info("Running Backend Preoptimization...") + pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config) + + + # Initalize dataset + dataset = load_dataset(model_info) + dataset.preprocess() + base_report['Dataset'] = model_info['dataset_name'].upper( + ) if model_info['dataset_name'] else None + + #Placeholder Only + segment_info = self.compile_backend.segment(pre_compile_config) + + best_batch_sizes = self.compile_backend.get_best_batch_size() + if isinstance(best_batch_sizes, list): + pre_compile_config['workload'][ + 'batch_sizes'] = best_batch_sizes + + log.info("Start to compile the model...") + start = time.time() + compile_info = self.compile_backend.compile(pre_compile_config, + dataset) + end = time.time() + + graph_compile_report = {} + graph_compile_report["Compile Duration"] = round(end - start, 5) + graph_compile_report["Compile Precision"] = compile_info[ + 'compile_precision'] + graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent'] + if 'optimizations' in compile_info: + graph_compile_report['Optimizations'] = compile_info['optimizations'] + if 'instance_count' in compile_info: + base_report['Instance Count'] = compile_info['instance_count'] + if 'device_count' in compile_info: + base_report['Device Count'] = compile_info['device_count'] + base_report['Graph Compile'] = graph_compile_report + + # Initalize Output Dir and Reports + output_dir = os.path.abspath('general_perf/reports/' + + self.backend_type + '/' + + workload['model']) + os.makedirs(output_dir, exist_ok=True) + + # Compile only mode will stop here + if self.compile_only_mode: + base_report.pop("Backend") + return compile_info["compile_status"], base_report + + # load runtime backend + """ + Start Here + """ + batch_sizes = pre_compile_config['workload']['batch_sizes'] + self.runtime_backend.configs = compile_info + self.runtime_backend.workload = workload + self.runtime_backend.model_info = model_info 
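+        # Load the runtime backend at the first configured batch size, then wire it
+        # into the accuracy checker for the workload's dataset below.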
+ + self.runtime_backend.load(workload['batch_sizes'][0]) + # test accuracy + accuracy_report = {} + AccuracyChecker = self.get_accuracy_checker( + model_info['dataset_name'] + if model_info['dataset_name'] else 'fake_dataset') + AccuracyChecker.runtime_backend = self.runtime_backend + AccuracyChecker.dataloader = dataset + AccuracyChecker.output_dir = output_dir + AccuracyChecker.configs = compile_info + + if workload['test_accuracy']: + log.info("******************************************* Running Accuracy Checker... *******************************************") + + dataset.rebatch(self.runtime_backend.get_loaded_batch_size()) + accuracy_results = AccuracyChecker.calculate_acc( + workload['data_percent']) + + accuracy_report['Data Percent'] = workload['data_percent'] + accuracy_report.update(accuracy_results) + + # test numeric + if workload['test_numeric']: + log.info("******************************************* Running Numeric Checker... *******************************************") + + dataset.rebatch(self.runtime_backend.get_loaded_batch_size()) + if not workload['test_accuracy']: + accuracy_results = AccuracyChecker.calculate_acc( + workload['data_percent']) + diff_results = AccuracyChecker.calculate_diff() + accuracy_report.update(diff_results) + # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png" + + if accuracy_report: + base_report['Accuracy'] = accuracy_report + + # function to test qps and latency + if workload['test_perf']: + log.info("******************************************* Runing QPS Checker... *******************************************") + performance_reports = [] + qs_status = self.runtime_backend.is_qs_mode_supported() + if qs_status: + qs_config = self.runtime_backend.generate_qs_config() + performance_reports = self.qs_benchmark(qs_config) + else: + for bs in batch_sizes: + self.runtime_backend.load(bs) + batch_reports = self.runtime_backend.benchmark(dataset) + performance_reports.append(batch_reports) + base_report['Performance'] = performance_reports + + if "Instance Count" not in base_report: + log.warning("Vendors need to Add # of instances") + if "Device Count" not in base_report: + log.warning("Vendors need to Add # of devices") + + # write output to json file + output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json" + with open(output_report_path, 'w') as file: + json.dump(base_report, file, indent=4) + + base_report.pop("Backend") + log.info("Testing Finish. Report is saved in path: [ {}/{} ]". + format(output_dir[output_dir.rfind('general_perf'):], + os.path.basename(output_report_path))) + + return compile_info["compile_status"] + + #WIP + def qs_benchmark(self, qs_config: Dict[str, Any]) -> list: + return [] + + def get_accuracy_checker(self, dataset_name: str): + AccuracyChecker = importlib.import_module('general_perf.datasets.' 
+ + dataset_name + + ".test_accuracy") + AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker') + return AccuracyChecker() + + def get_model_info(self, model_name: str) -> Dict[str, Any]: + with open("general_perf/model_zoo/" + model_name + '.json', + 'r') as file: + model_info = json.load(file) + return model_info + + def get_cpu_name(self): + command = "lscpu | grep 'Model name' | awk -F: '{print $2}'" + cpu_name = subprocess.check_output(command, shell=True) + return cpu_name.decode().strip() + + def check_interact_info( + self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]: + interact_info = self.compile_backend.get_interact_profile( + pre_compile_config) + + answer = {} + if len(interact_info) == 0: + return answer + + dialog_style = Style.from_dict({ + 'dialog': 'bg:#88b8ff', + 'dialog frame.label': 'bg:#ffffff #000000', + 'dialog.body': 'bg:#000000 #a0acde', + 'dialog shadow': 'bg:#004aaa', + }) + + input_style = Style.from_dict({ + 'dialog': 'bg:#88b8ff', + 'dialog frame.label': 'bg:#ffffff #000000', + 'dialog.body': 'bg:#000000 #a0acde', + 'dialog shadow': 'bg:#004aaa', + 'text-area.prompt': 'bg:#ffffff', + 'text-area': '#000000', + }) + + option = yes_no_dialog(title=self.backend_type + '编译配置', + text='[请选择]:是否进行编译后端配置:', + style=dialog_style).run() + if option: + sum_question = len(interact_info) + for i, question in enumerate(interact_info): + if question['depends']: + state = 0 + for title in question['depends'].split(','): + if not answer[title]: + state = 1 + if state: + continue + if question['dialog_type'] == 'Yes/No Dialog': + option = yes_no_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + style=dialog_style).run() + elif question['dialog_type'] == "Input Dialog": + option = input_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + style=input_style).run() + elif question['dialog_type'] == "Radiolist Dialog": + choice = [(i, text) + for i, text in enumerate(question['options'])] + num = radiolist_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + values=choice, + style=dialog_style).run() + option = question['options'][num] if num is not None else question[ + 'default'] + answer[question['name']] = option + + return answer + + def activate_venv(self, hardware_type: str) -> bool: + + return True + + def deactivate_venv(self): + sys.path[: + 0] = self.prev_sys_path #will also revert the added site-packages + sys.prefix = self.real_prefix + os.environ['PATH'] = self.old_os_path + + +if __name__ == "__main__": + engine = PerfEngine() + engine.start_engine() diff --git a/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..7911aecdb775bcec206c398b81eff18e083597f0 --- /dev/null +++ b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh @@ -0,0 +1,44 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +# Start to test +set -x +ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx +cd ${PROJ_PATH} + +run(){ + BS=16 + TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx + TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine + if [[ ! -f "${ORIGIN_ONNX}" ]];then + echo "${ORIGIN_ONNX} not exists!" + exit 1 + fi + + # Graph optimize + python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx + + # Build Engine + ixrtexec --onnx ${TARGET_ONNX} --min_shape image:${BS}x3x224x224,text:100x77 \ + --opt_shape image:${BS}x3x224x224,text:100x77 \ + --max_shape image:${BS}x3x224x224,text:100x77 \ + --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77 + + # Test Performance + ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77 + +} +run 1 \ No newline at end of file diff --git a/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh new file mode 100644 index 0000000000000000000000000000000000000000..c57f758d35547a14106d1acbedb2510fba335c44 --- /dev/null +++ b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh @@ -0,0 +1,34 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# #!/bin/bash +echo "******************* Downloading Model.... *******************" + +mkdir -p general_perf/model_zoo/regular +mkdir -p general_perf/model_zoo/popular +mkdir -p general_perf/model_zoo/sota +mkdir -p general_perf/download +mkdir -p datasets/open_cifar/ + +wget -O general_perf/download/open_videobert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_videobert.tar +tar xf general_perf/download/open_videobert.tar -C general_perf/model_zoo/popular/ + + +# # Download Datasets +wget -O general_perf/download/cifar-100-python.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/cifar-100-python.tar +tar xf general_perf/download/cifar-100-python.tar -C datasets/open_cifar + + +echo "Extract Done." 
\ No newline at end of file diff --git a/models/recommendation/widedeep/ixrt/README.md b/models/recommendation/widedeep/ixrt/README.md index fb01a4d49378ff076dd712ca225bb352262d1f80..13b88008e1de42bdc6f004d9205e291f89ac1ecf 100644 --- a/models/recommendation/widedeep/ixrt/README.md +++ b/models/recommendation/widedeep/ixrt/README.md @@ -50,28 +50,26 @@ bash scripts/infer_widedeep_fp16_performance.sh ### Accuracy -If you want to evaluate the accuracy of this model, please visit the website: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend +If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework, supporting the ILUVATAR backend -For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. +For detailed steps regarding this model, please refer to this document: Note: You need to modify the relevant paths in the code to your own correct paths. ```bash -# Clone ByteMLPerf -git clone -b iluvatar_general_infer https://github.com/yudefu/ByteMLPerf.git -pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt -mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py +pip3 install -r toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt +mv perf_engine.py toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py # Get eval.csv and onnx -mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model -mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/ +mkdir -p toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model +mkdir -p toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/ wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/eval.csv -mv eval.csv ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/ +mv eval.csv toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/ wget http://files.deepspark.org.cn:880/deepspark/widedeep_dynamicshape_new.onnx -mv widedeep_dynamicshape_new.onnx ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/ +mv widedeep_dynamicshape_new.onnx toolbox/ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/ # Run Acc scripts -cd ./ByteMLPerf/byte_infer_perf/general_perf +cd toolbox/ByteMLPerf/byte_infer_perf/general_perf python3 core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32 ``` diff --git a/toolbox/ByteMLPerf/CONTRIBUTING.md b/toolbox/ByteMLPerf/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..e8de88fde2de45151b6a3f3ea52ba4267226c7ce --- /dev/null +++ b/toolbox/ByteMLPerf/CONTRIBUTING.md @@ -0,0 +1,52 @@ + +# Contributing to Byte MLPerf + +First of all, thanks for taking the time to contribute! + +All types of contributions are encouraged and valued. See the [Table of Contents](#table-of-contents) for different ways to help and details about how this project handles them. Please make sure to read the relevant section before making your contribution. It will make it a lot easier for our maintainers and smooth out the experience for all involved. The community looks forward to your contributions. + +> And if you like the project, but just don't have time to contribute, that's fine. 
There are other easy ways to support the project and show your appreciation, which we would also be very happy about: +> - Star the project +> - Tweet about it +> - Refer this project in your project's readme +> - Mention the project at local meetups and tell your friends/colleagues + + +## Table of Contents + +- [Contributor License Agreementt](#contributor-license-agreement) +- [Pull Requests](#pull-requests) +- [I Have a Question](#i-have-a-question) + +## Contributor License Agreement + +Thank you for your interest in contributing to open source projects hosted or managed by Bytedance Ltd. and/or its Affiliates ("ByteDance"). In order to clarify the intellectual property license granted with Contributions from any person or entity, ByteDance must have a Contributor License Agreement ("CLA") on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of ByteDance and its users; it does not change your rights to use your own Contributions for any other purpose. + +- If you work for a company that wants to allow you to contribute your work, then you'll need to sign a corporate CLA. + +- If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an individual CLA. + +- If you have not already done so, please complete and sign, then scan and email a pdf file of this Agreement to opensource-cla@bytedance.com. Please read this document carefully before signing and keep a copy for your records. + +## Pull Requests +We actively welcome your pull requests. + +- Fork the repo and create your branch from `master`. +- If you've changed APIs, update the documentation. +- Make sure your code lints. +- If you haven't already, complete the Contributor License Agreement ("CLA"). + + +## I Have a Question + +> If you want to ask a question, we assume that you have read the available [Documentation](). + +Before you ask a question, it is best to search for existing [Issues](https://github.com/bytedance/ByteMLPerf/issues) that might help you. In case you have found a suitable issue and still need clarification, you can write your question in this issue. It is also advisable to search the internet for answers first. + +If you then still feel the need to ask a question and need clarification, we recommend the following: + +- Open an [Issue](https://github.com/bytedance/ByteMLPerf/issues/new). +- Provide as much context as you can about what you're running into. +- Provide project and platform versions, depending on what seems relevant. + +We will then take care of the issue as soon as possible. diff --git a/toolbox/ByteMLPerf/LICENSE b/toolbox/ByteMLPerf/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/toolbox/ByteMLPerf/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/toolbox/ByteMLPerf/NOTICE b/toolbox/ByteMLPerf/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..4b563f6914ab0ebc4af4279ff29752ed518f71ac --- /dev/null +++ b/toolbox/ByteMLPerf/NOTICE @@ -0,0 +1,2 @@ +ByteMLPerf +Copyright 2023 ByteDance Ltd. and/or its affiliates. \ No newline at end of file diff --git a/toolbox/ByteMLPerf/README.md b/toolbox/ByteMLPerf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..418f41afdbd4d815b99775c877de53eba41767d7 --- /dev/null +++ b/toolbox/ByteMLPerf/README.md @@ -0,0 +1,41 @@ +
+ +
+ + +# ByteMLPerf Benchmark Tool +ByteMLPerf is an AI Accelerator Benchmark that focuses on evaluating AI Accelerators from practical production perspective, including the ease of use and versatility of software and hardware. Byte MLPerf has the following characteristics: +- Models and runtime environments are more closely aligned with practical business use cases. +- For ASIC hardware evaluation, besides evaluate performance and accuracy, it also measure metrics like compiler usability and coverage. +- Performance and accuracy results obtained from testing on the open Model Zoo serve as reference metrics for evaluating ASIC hardware integration. + +## Category +The ByteMLPerf benchmark is structured into three main categories: Inference, Training, and Micro, each targeting different aspects of AI accelerator performance: + +- Inference: This category is subdivided into two distinct sections to cater to different types of models: + + - General Performance: This section is dedicated to evaluating the inference capabilities of accelerators using common models such as ResNet-50 and BERT. It aims to provide a broad understanding of the accelerator's performance across a range of typical tasks. Vendors can refer to this document for guidance on building general perf backend: [ByteMLPerf General Perf Guide](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof) [[中文版](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf#TDK8of)] + + - Large Language Model (LLM) Performance: Specifically designed to assess the capabilities of accelerators in handling large language models, this section addresses the unique challenges posed by the size and complexity of these models. Vendors can refer to this document for guidance on building llm perf backend: [ByteMLPerf LLM Perf Guide](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc) [[中文版](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc)] + +- Micro: The Micro category focuses on the performance of specific operations or "ops" that are fundamental to AI computations, such as Gemm, Softmax, and various communication operations. This granular level of testing is crucial for understanding the capabilities and limitations of accelerators at a more detailed operational level. Vendors can refer to this document for guidance on building micro perf backend: [ByteMLPerf Micro Perf Guide](https://bytedance.us.larkoffice.com/docx/EpjFdSpRsoOIHWxtKgjuRsMPsFB)[[中文版](https://bytedance.us.larkoffice.com/docx/LJWvdGVAzoxXkTxF9h9uIETbsWc)] + +- Training: Currently under development, this category aims to evaluate the performance of AI accelerators in training scenarios. It will provide insights into how well accelerators can handle the computationally intensive process of training AI models, which is vital for the development of new and more advanced AI systems. + +Vendors looking to evaluate and improve their AI accelerators can utilize the ByteMLPerf benchmark as a comprehensive guide. The benchmark not only offers a detailed framework for performance and accuracy evaluation but also includes considerations for compiler usability and coverage for ASIC hardware, ensuring a holistic assessment approach. 
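+As an illustration, a General Performance task is launched through `core/perf_engine.py`. The sketch below assumes the repository layout described above; the backend and task names are only examples taken from workloads referenced elsewhere in this repository:
+
+```bash
+# Minimal sketch of a General Perf run (backend/task names are illustrative)
+cd byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+```
+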
+
+For more details, you can visit our official website here: [bytemlperf.ai](https://bytemlperf.ai/)
+
+## Vendor List
+The ByteMLPerf vendor backend list is shown below:
+
+| Vendor | SKU | Key Parameters | Inference(General Perf) | Inference(LLM Perf) |
+| :---- | :----| :---- | :---- | :---- |
+| Intel | Xeon | - | - | - |
+| Stream Computing | STC P920 |
  • Computation Power:128 TFLOPS@FP16
  • Last Level Buffer: 8MB, 256GB/s
  • Level 1 Buffer: 1.25MB, 512GB/s
  • Memory: 16GB, 119.4GB/S
  • Host Interface:PCIe 4, 16x, 32GB/s
  • TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) | - | +| Graphcore | Graphcore® C600 |
  • Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8
  • In Processor Memory: 900 MB, 52 TB/s
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.md) | - | +| Moffett-AI | Moffett-AI S30 |
  • Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8,
  • Memory: 60 GB,
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) | - | +| Habana | Gaudi2 |
  • 24 Tensor Processor Cores, Dual matrix multiplication engines
  • Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) | - | + +## Statement +[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software) diff --git a/toolbox/ByteMLPerf/README.zh_CN.md b/toolbox/ByteMLPerf/README.zh_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..7f4779490be31232c063add2fcf14100b54d6256 --- /dev/null +++ b/toolbox/ByteMLPerf/README.zh_CN.md @@ -0,0 +1,39 @@ +
    + +
    + + +# ByteMLPerf Benchmark Tool +ByteMLPerf是字节使用的一个基准套件,用于测量推理系统在各种部署场景中运行模型的速度。相比MLPerf,ByteMLPerf有如下特点: +- 模型和运行环境会更贴近真实业务; +- 对于新硬件,除了评估性能和精度之外,同时也会评估图编译的易用性、覆盖率等指标; +- 在开放Model Zoo上测试所得的性能和精度,会作为新硬件引入评估的参考; + +## 类别 +ByteMLPerf 基准分为三个主要类别:推理(Inference)、训练(Training)和微观性能(Micro),每个类别针对 AI 加速器性能的不同方面: + +- Inference:此类别进一步细分为两个部分,以适应不同类型的模型: + - General Perf:此部分致力于使用常见模型(如 ResNet-50 和 BERT)评估加速器的推理能力。其目的是提供加速器在一系列典型任务中性能的广泛理解。想要接入General Perf的厂商可以参考该文档接入测试:[ByteMLPerf Inference General Perf厂商接入指南](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf) + - LLM Perf:专门设计用于评估加速器处理大型语言模型的能力,此部分解决了这些模型的大小和复杂性带来的独特挑战。想要接入LLM Perf的厂商可以参考该文档接入测试:[ByteMLPerf Inference LLM Perf厂商接入指南](https://bytedance.larkoffice.com/docx/ZoU7dkPXYoKtJtxlrRMcNGMwnTc) + +- Micro:Micro Perf侧重于评估特定操作或“运算”(如 Gemm、Softmax 和各种通信操作)的性能,这些操作是 AI 计算的基础。这种详细级别的测试对于了解加速器在更细致的操作层面的能力和限制至关重要。想要接入Micro Perf的厂商可以参考该文档接入测试:[ByteMLPerf Micro Perf厂商接入指南](https://bytedance.us.larkoffice.com/docx/LJWvdGVAzoxXkTxF9h9uIETbsWc) + +- Training:目前正在开发中的此类别旨在评估 AI 加速器在训练场景中的性能。它将提供关于加速器如何处理训练 AI 模型的计算密集过程的见解,这对于开发新的和更先进的 AI 系统至关重要。 + +希望评估和改进其 AI 加速器的供应商可以使用 ByteMLPerf 基准作为全面的指南。该基准不仅提供了性能和准确性评估的详细框架,还包括了 ASIC 硬件的编译器可用性和覆盖范围的考虑,确保了全面的评估方法。 + +更多细节您可以访问我们的官方网站:[bytemlperf.ai](https://bytemlperf.ai/) + +## Vendor List +目前支持的厂商Backend如下: + +| Vendor | SKU | Key Parameters | Inference(General Perf) | Inference(LLM Perf) | +| :---- | :----| :---- | :---- | :---- | +| Intel | Xeon | - | - | - | +| Stream Computing | STC P920 |
  • Computation Power:128 TFLOPS@FP16
  • Last Level Buffer: 8MB, 256GB/s
  • Level 1 Buffer: 1.25MB, 512GB/s
  • Memory: 16GB, 119.4GB/S
  • Host Interface:PCIe 4, 16x, 32GB/s
  • TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) | - | +| Graphcore | Graphcore® C600 |
  • Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8
  • In Processor Memory: 900 MB, 52 TB/s
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.md) | - | +| Moffett-AI | Moffett-AI S30 |
  • Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8,
  • Memory: 60 GB,
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) | - | +| Habana | Gaudi2 |
  • 24 Tensor Processor Cores, Dual matrix multiplication engines
  • Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) | - | + +## Statement +[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a811abb80315a5928196c6e31b8f8f27de976055 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.md @@ -0,0 +1,108 @@ +
    + +
+
+
+# Byte MLPerf Inference Benchmark Tool
+Byte MLPerf (Inference) is an AI Accelerator Benchmark that focuses on evaluating AI Accelerators from a practical production perspective, including the ease of use and versatility of software and hardware. Byte MLPerf has the following characteristics:
+- Models and runtime environments are more closely aligned with practical business use cases.
+- For ASIC hardware evaluation, besides evaluating performance and accuracy, it also measures metrics like compiler usability and coverage.
+- Performance and accuracy results obtained from testing on the open Model Zoo serve as reference metrics for evaluating ASIC hardware integration.
+
+Vendors can refer to this document for guidance on building a backend: [ByteMLPerf Guide](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof) [[中文版](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf#TDK8of)]
+
+## Usage
+launch.py is the entry point. When using Byte MLPerf to evaluate a model, you only need to pass in two parameters, --task and --hardware_type, as shown below:
+```bash
+python3 launch.py --task xxx --hardware_type xxx
+```
+
+1. task
+--task specifies the name of the workload to evaluate. For example, to evaluate the workload bert-tf-fp16.json, specify --task bert-tf-fp16.
+Note: All workloads are defined under general_perf/workloads, and the name passed in needs to match the file name. The current format is model-framework-precision.
+
+2. hardware_type
+--hardware_type specifies the hardware backend name; it has no default value and must be provided by the user. Example: to evaluate Habana Goya, specify --hardware_type GOYA.
+Note: All hardware types are defined under general_perf/backends, and the name passed in needs to match the folder name.
+
+3. compile_only
+--compile_only stops the task once compilation is finished.
+
+4. show_task_list
+--show_task_list prints all task names.
+
+5. show_hardware_list
+--show_hardware_list prints all supported hardware backends.
+
+### Workload Description
+A workload definition needs to contain the following fields:
+```javascript
+{
+  "model": "bert-torch-fp32",      //The name of the model to be evaluated, which needs to match the model_zoo name
+  "test_perf": true,               //Evaluate model performance
+  "test_accuracy": true,           //Evaluate model accuracy
+  "test_numeric": true,            //Accuracy: evaluate numerical deviation
+  "clients": 3,                    //Performance: client threads that submit data
+  "iterations": 100,               //Performance: how many iterations are submitted by each thread
+  "batch_sizes": [1,4,8,16,32,64], //Performance: the batch size used when each thread submits data
+  "data_percent": 50,              //Accuracy: percentage of the dataset used to assess accuracy, [1-100]
+  "compile_only": false,           //Compile the model only
+}
+```
+
+## Model Zoo List
+Model Zoo & Dataset
+The models supported by Byte MLPerf are collected under the Model Zoo. In terms of access rights, they are currently divided into internal models and open models. Each Byte MLPerf release ships with the open models included in that version.
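+As a concrete illustration, the helper script used in the ILUVATAR backend notes (backends/ILUVATAR/README.zh_CN.md) can fetch an open model together with the dataset used to evaluate it; the script location and arguments below are taken from those notes and may differ between releases:
+
+```bash
+# Assumes the repository layout used in the ILUVATAR backend notes
+cd ByteMLPerf/byte_infer_perf/general_perf
+# Fetch the open BERT model and the SQuAD 1.1 data used for its accuracy check
+bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad
+```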
+
+Open model collection principles:
+- Basic Model: including ResNet50, BERT and WnD;
+- Popular Model: includes models currently widely used in the industry;
+- SOTA: including SOTA models corresponding to business domains;
+
+In addition to complete model structures, Byte MLPerf will also add some typical model substructures (subgraphs or ops) when no suitable open model containing such classic substructures can be found, such as transformer encoders/decoders with different sequence lengths, common conv ops such as group conv, depthwise conv and point-wise conv, and common RNN structures such as GRU/LSTM.
+
+| Model | Domain | Purpose | Framework | Dataset | Precision |
+| ---- | ---- | ---- | ---- | ---- | ---- |
+| resnet50-v1.5 | cv | regular | tensorflow, pytorch | imagenet2012 | fp32 |
+| bert-base | nlp | regular | tensorflow, pytorch | squad-1.1 | fp32 |
+| wide&deep | rec | regular | tensorflow | criteo | fp32 |
+| videobert | mm | popular | onnx | cifar100 | fp32 |
+| albert | nlp | popular | pytorch | squad-1.1 | fp32 |
+| conformer | nlp | popular | onnx | none | fp32 |
+| roformer | nlp | popular | tensorflow | cail2019 | fp32 |
+| yolov5 | cv | popular | onnx | none | fp32 |
+| roberta | nlp | popular | pytorch | squad-1.1 | fp32 |
+| deberta | nlp | popular | pytorch | squad-1.1 | fp32 |
+| swin-transformer | cv | popular | pytorch | imagenet2012 | fp32 |
+| gpt2 | nlp | sota | pytorch | none | fp32 |
+| stable diffusion | cv | sota | onnx | none | fp32 |
+| LlaMa2 7B | nlp | sota | torch | none | fp16 |
+| chatGLM2 6B | nlp | sota | torch | none | fp16 |
+
+### ByteIR
+
+The ByteIR Project is a ByteDance model compilation solution. ByteIR includes compiler, runtime, and frontends, and provides an end-to-end model compilation solution.
+
+Although all ByteIR components (compiler/runtime/frontends) work together to provide an end-to-end solution and live under the same repository umbrella, each component can technically be used independently.
+
+For more information, please refer to [ByteIR](https://github.com/bytedance/byteir)
+
+Models supported by ByteIR:
+| Model | Domain | Purpose | Framework | Dataset | Precision |
+| ---- | ---- | ---- | ---- | ---- | ---- |
+| resnet50-v1.5 | cv | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50_mhlo.tar) | imagenet2012 | fp32 |
+| bert-base | nlp | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/bert_mhlo.tar) | squad-1.1 | fp32 |
+
+## Vendor List
+The ByteMLPerf vendor backend list is shown below:
+
+| Vendor | SKU | Key Parameters | Supplement |
+| :---- | :----| :---- | :---- |
+| Intel | Xeon | - | - |
+| Stream Computing | STC P920 |
  • Computation Power:128 TFLOPS@FP16
  • Last Level Buffer: 8MB, 256GB/s
  • Level 1 Buffer: 1.25MB, 512GB/s
  • Memory: 16GB, 119.4GB/S
  • Host Interface:PCIe 4, 16x, 32GB/s
  • TDP: 160W | [STC Introduction](general_perf/backends/STC/README.md) | +| Graphcore | Graphcore® C600 |
  • Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8
  • In Processor Memory: 900 MB, 52 TB/s
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 185W | [IPU Introduction](general_perf/backends/IPU/README.md) | +| Moffett-AI | Moffett-AI S30 |
  • Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8,
  • Memory: 60 GB,
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 250W | [SPU Introduction](general_perf/backends/SPU/README.md) | +| Habana | Gaudi2 |
  • 24 Tensor Processor Cores, Dual matrix multiplication engines
  • Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](general_perf/backends/HPU/README.md) | + +## Statement +[ASF Statement on Compliance with US Export Regulations and Entity List](https://news.apache.org/foundation/entry/statement-by-the-apache-software) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.zh_CN.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.zh_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..dadac89f3814b60cc69ad9411f180feb883cc979 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/README.zh_CN.md @@ -0,0 +1,105 @@ +
    + +
    + + +# Byte MLPerf Inference Benchmark Tool +Byte MLPerf(推理)是字节使用的一个基准套件,用于测量推理系统在各种部署场景中运行模型的速度。相比MLPerf,Byte MLPerf有如下特点: +- 模型和运行环境会更贴近真实业务; +- 对于新硬件,除了评估性能和精度之外,同时也会评估图编译的易用性、覆盖率等指标; +- 在开放Model Zoo上测试所得的性能和精度,会作为新硬件引入评估的参考; + +厂商可以参考该文档接入测试:[ByteMLPerf厂商接入指南](https://bytedance.feishu.cn/docs/doccno9eLS3OseTA5aMBeeQf2cf) [[English Version](https://bytedance.us.feishu.cn/docx/L98Mdw3J6obMtJxeRBzuHeRbsof)] + +## Usage +用户使用入口为launch.py, 在使用byte mlperf评估时,只需传入--task 、--hardware_type 两个参数,如下所示: +```bash +python3 launch.py --task xxx --hardware_type xxx +``` + +1. tasks +--task 参数为传入的workload 名字,需要指定评估workload,例如:若要评估 open_bert-tf-fp16.json 定义的 workload,则需指定 --task open_bert-tf-fp16 。 +注:所有workload定义在general_perf/workloads下,传参时名字需要和文件名对齐。目前格式为model-framework-precision。 + +2. hardware_type +--hardware_type 参数为传入的hardware_type 名字,无默认值,必须用户指定。例如:若要评估 Habana Goya ,则需指定 --hardware_type GOYA 。 +注:所有hardware type定义在general_perf/backends下,传参时名字需要和folder名对齐。 + +3. compile_only +--compile_only 参数将在模型编译完成后停止任务 + +4. show_task_list +--show_task_list 参数会打印所有任务名字 + +5. show_hardware_list +--show_hardware_list 参数会打印目前所有支持的硬件Backend名称 + +### Workload说明 +一个workload定义需包含如下字段: +```javascript +{ + "model": "bert-torch-fp32", //待评估模型的名字,需要和model_zoo名字对齐 + "test_perf": true, //是否评估模型性能 + "test_accuracy": true, //是否评估模型精度 + "test_numeric": true, //精度:是否评估数值误差 + "clients": 3, //性能:提交数据的client threads + "iterations": 100, //性能:每个thread提交多少iteration + "batch_sizes":[1,4,8,16,32], //性能:每个thread提交数据时的bs + "data_percent": 50, //精度:使用百分多少数据集评估精度, [1-100] + "compile_only": false, //是否仅编译模型 +} +``` + +## Model Zoo List +Model Zoo&Dataset +Model Zoo下收录了Byte MlPerf支持的模型,从访问权限上,目前分为内部模型、开放模型。随Byte MlPerf 发布的是对应版本收录的开放模型。 +Dataset为模型需要用到数据集,对应的dataloader、accuracy_checker从结构上也归入Dataset。 + +开放模型收录原则: +- 基础模型:包含十分常见的Rn50、Bert和WnD; +- 业务类似:包含目前内部较多的、或结构相似的模型结构; +- 前沿模型:包含业务领域对应的SOTA模型; + +此外,除了完整模型结构,Byte MlPerf还会加入一些典型模型子结构子图或OP(前提是开放模型无法找到合适的完整模型包含这类经典子结构),比如各不同序列长度的transformer encoder/decoder,各类常见conv op,如group conv、depwise-conv、point-wise conv,以及rnn 常见结构,如gru/lstm等。 + +| Model | Domain | Purpose | Framework | Dataset | Precision | +| ---- | ---- | ---- | ---- | ---- | ---- | +| resnet50-v1.5 | cv | regular | tensorflow, pytorch | imagenet2012 | fp32 | +| bert-base | nlp | regular | tensorflow, pytorch | squad-1.1 | fp32 | +| wide&deep | rec | regular | tensorflow | criteo | fp32 | +| videobert | mm |popular | onnx | cifar100 | fp32 | +| albert | nlp | popular | pytorch | squad-1.1 | fp32 | +| conformer | nlp | popular | onnx | none | fp32 | +| roformer | nlp | popular | tensorflow | cail2019 | fp32 | +| yolov5 | cv | popular | onnx | none | fp32 | +| roberta | nlp | popular | pytorch | squad-1.1 | fp32 | +| deberta | nlp | popular | pytorch | squad-1.1 | fp32 | +| swin-transformer | cv | popular | pytorch | imagenet2012 | fp32 | +| gpt2 | nlp | sota | pytorch | none | fp32 | +| stable diffusion | cv | sota | onnx | none | fp32 | +| LlaMa2 7B | nlp | sota | torch | none | fp16 | +| chatGLM2 6B | nlp | sota | torch | none | fp16 | + +### ByteIR + +ByteIR项目是字节跳动的模型编译解决方案。ByteIR包括编译器、运行时和前端,并提供端到端的模型编译解决方案。 尽管所有的ByteIR组件(编译器/runtime/前端)一起提供端到端的解决方案,并且都在同一个代码库下,但每个组件在技术上都可以独立运行。 + +更多信息请查看[ByteIR](https://github.com/bytedance/byteir) + +ByteIR 编译支持的模型列表: +| Model | Domain | Purpose | Framework | Dataset | Precision | +| ---- | ---- | ---- | ---- | ---- | ---- | +| resnet50-v1.5 | cv | regular | [mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50_mhlo.tar) | imagenet2012 | fp32 | +| bert-base | nlp | regular | 
[mhlo](https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/bert_mhlo.tar) | squad-1.1 | fp32 | + + +## Vendor List +目前支持的厂商Backend如下: + +| Vendor | SKU | Key Parameters | Supplement | +| :---- | :----| :---- | :---- | +| Intel | Xeon | - | - | +| Stream Computing | STC P920 |
  • Computation Power:128 TFLOPS@FP16
  • Last Level Buffer: 8MB, 256GB/s
  • Level 1 Buffer: 1.25MB, 512GB/s
  • Memory: 16GB, 119.4GB/S
  • Host Interface:PCIe 4, 16x, 32GB/s
  • TDP: 160W | [STC Introduction](byte_infer_perf/general_perf/backends/STC/README.md) | +| Graphcore | Graphcore® C600 |
  • Compute: 280 TFLOPS@FP16, 560 TFLOPS@FP8
  • In Processor Memory: 900 MB, 52 TB/s
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 185W | [IPU Introduction](byte_infer_perf/general_perf/backends/IPU/README.zh_CN.md) | +| Moffett-AI | Moffett-AI S30 |
  • Compute: 1440 (32x-Sparse) TFLOPS@BF16, 2880 (32x-Sparse) TOPS@INT8,
  • Memory: 60 GB,
  • Host Interface: Dual PCIe Gen4 8-lane interfaces, 32GB/s
  • TDP: 250W | [SPU Introduction](byte_infer_perf/general_perf/backends/SPU/README.md) | +| Habana | Gaudi2 |
  • 24 Tensor Processor Cores, Dual matrix multiplication engines
  • Memory: 96 GB HBM2E, 48MB SRAM | [HPU Introduction](byte_infer_perf/general_perf/backends/HPU/README.md) | diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e645a9c4910ed16d8662ea0ee56cf8c34431426 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/__init__.py @@ -0,0 +1,54 @@ +import sys +from packaging.version import parse +import warnings + +from .version import __version__ + +def digit_version(version_str: str, length: int = 4): + """Convert a version string into a tuple of integers. + + This method is usually used for comparing two versions. For pre-release + versions: alpha < beta < rc. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Defaults to 4. + + Returns: + tuple[int]: The version info in digits (integers). + """ + assert 'parrots' not in version_str + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + mapping = {'a': -3, 'b': -2, 'rc': -1} + val = -4 + # version.pre can be None + if version.pre: + if version.pre[0] not in mapping: + warnings.warn(f'unknown prerelease version {version.pre[0]}, ' + 'version checking may go wrong') + else: + val = mapping[version.pre[0]] + release.extend([val, version.pre[-1]]) + else: + release.extend([val, 0]) + + elif version.is_postrelease: + release.extend([1, version.post]) # type: ignore + else: + release.extend([0, 0]) + return tuple(release) + +python3_minimum_version = '3.6.0' +python_version = digit_version(sys.version.split()[0]) + +assert (python_version >= digit_version(python3_minimum_version)), \ + f'PYTHON=={sys.version.split()[0]} is used but incompatible. ' \ + f'Please install python>={python3_minimum_version}.' 
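+# Illustrative outputs of digit_version above, derived from its logic and shown here
+# only as documentation (not executed at import time):
+#   digit_version('1.13.1')   -> (1, 13, 1, 0, 0, 0)
+#   digit_version('2.0.0rc1') -> (2, 0, 0, 0, -1, 1)   # prerelease 'rc' maps to -1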
+ +__all__ = ['__version__'] \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.py new file mode 100644 index 0000000000000000000000000000000000000000..400a0ac958faa159bfbc1d279bc2beade8e7bcf2 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.py @@ -0,0 +1,106 @@ +import argparse +import logging +import os +import importlib +import json +import sys +BYTE_MLPERF_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +os.chdir(BYTE_MLPERF_ROOT) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +from general_perf.core.configs.workload_store import load_workload +from general_perf.core.configs.dataset_store import load_dataset +from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("CPUBase") + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument("--task", default='resnet50-tf-fp32') + parser.add_argument("--hardware_type", default="CPU") + parser.add_argument("--batch_size", + type=int, + help="Batch sizes we will test in performace mode") + parser.add_argument( + "--data_percent", + type=int, + help= + "Data percent we will used in the whole data set when we will test in accuracy mode" + ) + args = parser.parse_args() + return args + + +class PerfEngine(object): + def __init__(self) -> None: + super().__init__() + self.args = get_args() + self.workload = load_workload(self.args.task) + self.backend_type = self.args.hardware_type + + def start_engine(self): + ''' + Byte MlPerf will create an virtual env for each backend to avoid dependance conflict + ''' + log.info("Runing CPU Base...") + + self.compile_backend = init_compile_backend(self.args.hardware_type) + self.runtime_backend = init_runtime_backend(self.args.hardware_type) + if self.workload: + return self.workload_perf(self.workload) + + def workload_perf(self, workload): + # set reports dir + output_dir = os.path.abspath('general_perf/reports/' + self.args.hardware_type + + '/' + workload['model']) + os.makedirs(output_dir, exist_ok=True) + + model_info = self.get_model_info(workload['model']) + + ds = load_dataset(model_info) + ds.preprocess() + + compile_info = self.compile_backend.compile({ + "workload": workload, + 'model_info': model_info + }) + + # load runtime backend + runtime_backend = self.runtime_backend + runtime_backend.configs = compile_info + runtime_backend.workload = workload + runtime_backend.model_info = model_info + runtime_backend.load(workload['batch_sizes'][0]) + # test accuracy + if workload['test_accuracy'] or workload['test_numeric']: + ds.rebatch(self.args.batch_size) + AccuracyChecker = self.get_accuracy_checker( + model_info['dataset_name'] + if model_info['dataset_name'] else 'fake_dataset') + AccuracyChecker.runtime_backend = runtime_backend + AccuracyChecker.dataloader = ds + AccuracyChecker.output_dir = output_dir + AccuracyChecker.configs = compile_info + AccuracyChecker.calculate_acc(workload['data_percent']) + + return + + def get_accuracy_checker(self, dataset_name: str): + AccuracyChecker = importlib.import_module('general_perf.datasets.' 
+ + dataset_name + + ".test_accuracy") + AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker') + return AccuracyChecker() + + def get_model_info(self, model_name: str): + with open("general_perf/model_zoo/" + model_name + '.json', 'r') as f: + model_info = json.load(f) + return model_info + + +if __name__ == "__main__": + engine = PerfEngine() + engine.start_engine() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh new file mode 100644 index 0000000000000000000000000000000000000000..f786dfa0ba7c3d2e2950f55e0a284db2088d59de --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/calculate_cpu_diff.sh @@ -0,0 +1,12 @@ +#!bin/bash +if [ ! -d "general_perf/backends/CPU/venv" ];then + virtualenv -p python3 general_perf/backends/CPU/venv + source general_perf/backends/CPU/venv/bin/activate + general_perf/backends/CPU/venv/bin/python3 -m pip install --upgrade pip -q + general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q +else + source general_perf/backends/CPU/venv/bin/activate + general_perf/backends/CPU/venv/bin/python3 -m pip install -r general_perf/backends/CPU/requirements.txt -q +fi + +python3 general_perf/backends/CPU/calculate_cpu_diff.py --task $1 --batch_size $2 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..3d88a1114f5c132a44a129516b5fb9e5da4fcf3c --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/compile_backend_cpu.py @@ -0,0 +1,97 @@ +import os +import json +import logging +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +import tensorflow as tf +import torch +import onnxruntime +import time +import numpy as np + +from general_perf.backends import compile_backend + +log = logging.getLogger("CompileBackendCPU") + +pt_dtype_map = { + "FLOAT32": torch.float32, + "FLOAT16": torch.float16, + "INT8": torch.int8, + "LONG": torch.long +} + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64 +} + + +class CompileBackendCPU(compile_backend.CompileBackend): + def __init__(self): + super(CompileBackendCPU, self).__init__() + self.hardware_type = 'CPU' + self.need_reload = False + self.model_runtimes = [] + + def compile(self, config, dataloader=None): + result = { + "model": + config['model_info']['model'], + "framework": + config['model_info']['framework'], + "compile_precision": + config['model_info']['model_precision'], + "optimizations":{}, + "instance_count": 1, + "device_count": 128, + "input_type": + config['model_info']['input_type'].split(","), + "max_batch_size": + config['model_info']['max_batch_size'], + "compile_status": + "success", + "sg_percent": + 100, + "segments": [ + { + "sg_idx": + 0, + "is_fallback": + False, + "input_tensor_map": + config['model_info']['input_shape'], + "output_tensor_map": + config['model_info']['outputs'], + "compiled_model": [ + { + "compiled_bs": 1, + "compiled_obj": config['model_info']['model_path'], + }, + ], + }, + ] + } + self.configs = result + self.workload = config['workload'] + self.model_info = config['model_info'] + return result + + def get_interact_profile(self, config): + model_profile = [] + file_path = 
"general_perf/backends/CPU/" + self.hardware_type + '.json' + if os.path.exists(file_path): + with open(file_path, 'r') as f: + model_profile = json.load(f) + else: + log.info( + 'File path: {} does not exist, please check'.format(file_path)) + + return model_profile + + def get_best_batch_size(self): + """ + Get Best Batch Size for the model + """ + return None \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b12c02eb3832026cd61894304cec4eaf4b08a4a1 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/requirements.txt @@ -0,0 +1,14 @@ +matplotlib +scikit-learn +opencv-python-headless +transformers +tokenization +bert-tensorflow==1.0.1 +torchvision +onnx +numpy==1.19.2 +tensorflow==2.4.0 +onnxruntime +torch==1.13.1 +sentencepiece==0.1.96 +pandas==1.3.3 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py new file mode 100644 index 0000000000000000000000000000000000000000..eec8c98b2ca0b614e7516e2ee4ff991f46d86ee6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/CPU/runtime_backend_cpu.py @@ -0,0 +1,184 @@ +import os +import json +import logging +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +import tensorflow as tf +import torch +import onnxruntime +import time +import numpy as np + +from general_perf.backends import runtime_backend + +log = logging.getLogger("BackendCPU") + +pt_dtype_map = { + "FLOAT32": torch.float32, + "FLOAT16": torch.float16, + "INT8": torch.int8, + "LONG": torch.long +} + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64, + "BOOL": np.bool +} + + +class RuntimeBackendCPU(runtime_backend.RuntimeBackend): + def __init__(self): + super(RuntimeBackendCPU, self).__init__() + self.hardware_type = 'CPU' + self.need_reload = False + self.model_runtimes = [] + self.configs = None + self.batch_size = -1 + + def predict(self, feeds): + results = {} + if self.framework == "Tensorflow": + entry_rt = self.model_runtimes[0].signatures['serving_default'] + all_sn_inputs = entry_rt.structured_input_signature + + def get_real_feeds(feeds, sn_inputs): + sn_inputs = tf.nest.flatten(sn_inputs, True) + real_feeds = {} + itr = 0 + for _, val in feeds.items(): + real_feeds[sn_inputs[itr].name] = tf.constant(val) + itr += 1 + return real_feeds + + real_feeds = get_real_feeds(feeds, all_sn_inputs) + + for model_runtime in self.model_runtimes: + with tf.device('/CPU:0'): + _results = model_runtime.signatures['serving_default']( + **real_feeds) + + results = {} + for key, val in _results.items(): + results[key] = val.numpy() + + assert len(results) != 0 + + elif self.framework == "Pytorch": + input_tensors = [] + i = 0 + for key, _ in feeds.items(): + input_tensors.append( + torch.tensor(feeds[key], + dtype=pt_dtype_map[self.input_type[i]]).to( + self.device)) + i += 1 + with torch.no_grad(): + for model_runtime in self.model_runtimes: + results = model_runtime(*input_tensors) + + if isinstance(results, dict): + for key, val in results.items(): + results[key] = val.cpu().detach().numpy() + elif isinstance(results, tuple): + dic = {} + for i, key in enumerate(self.outputs): + dic[key] = list(results)[i] + else: + results = 
{self.outputs[0]: results.cpu().numpy()} + else: + for model_runtime in self.model_runtimes: + results = model_runtime.run(None, feeds) + return results + + def benchmark(self, dataloader): + iterations = self.workload['iterations'] + batch_size = self.get_loaded_batch_size() + times_range = [] + report = {} + report['BS'] = batch_size + test_data = self._get_fake_samples( + batch_size, self.configs['segments'][0]['input_tensor_map'], + self.configs['input_type']) + + for _ in range(30): + self.predict(test_data) + + for _ in range(iterations): + start_time = time.time() + self.predict(test_data) + end_time = time.time() + times_range.append(end_time - start_time) + + times_range.sort() + tail_latency = round( + times_range[int(len(times_range) * 0.99)] * 1000, 2) + avg_latency = round(sum(times_range) / iterations * 1000, 2) + qps = int(1000.0 * batch_size / avg_latency) + + log.info( + 'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'. + format(batch_size, qps, avg_latency, tail_latency)) + + report['QPS'] = qps + report['AVG Latency'] = avg_latency + report['P99 Latency'] = tail_latency + + return report + + def get_loaded_batch_size(self): + return self.batch_size + + def load(self, batch_size) -> None: + self.batch_size = batch_size + self.model_runtimes = [] + self.input_type = self.configs['input_type'] + self.framework = self.configs['framework'] + + self.model_name = self.configs['model'] + + for i, segment in enumerate(self.configs['segments']): + # there is no input/output meta data i the graph so it need to come from config. + if not segment['input_tensor_map']: + raise ValueError("Segment " + str(i) + " needs inputs") + if not segment['output_tensor_map']: + raise ValueError("Segment " + str(i) + " needs outputs") + + self.input_shapes = segment['input_tensor_map'] + self.outputs = segment['output_tensor_map'].split(",") + + if self.framework == "Tensorflow": + with tf.device('/CPU:0'): + model = tf.saved_model.load( + segment['compiled_model'][0]['compiled_obj']) + elif self.framework == "Pytorch": + self.device = "cpu" + model = torch.jit.load( + segment['compiled_model'][0]['compiled_obj'], + torch.device('cpu')) + model.eval() + else: + model = onnxruntime.InferenceSession( + segment['compiled_model'][0]['compiled_obj'], + providers=['CPUExecutionProvider']) + + self.model_runtimes.append(model) + + def _get_fake_samples(self, batch_size, shape, input_type): + data = {} + if input_type: + i = 0 + for key, val in shape.items(): + if key != "text": + val = [val[0] * batch_size] + val[1:] + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + else: + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + i += 1 + return data + else: + raise ValueError("Please provide input type") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md new file mode 100644 index 0000000000000000000000000000000000000000..402e2dceccd2b9385fc9599de60b6559f1704111 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/README.zh_CN.md @@ -0,0 +1,319 @@ +""" + ****************************************操作说明********************************* + 如果不想跑CPU端的性能、精度、数值指标对比,可以直接执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32(示例) + 如果模型提供了pt、pb格式的优先选择torch的配置进行测试; + 如果执行整个pipeline,需要执行:python3 lauch.py --hardware_type ILUVATAR --task 
widedeep-tf-fp32(示例)(跑cpu结果会很耗时) + + 功能实现: + 1、pt、pb模型转换在compile模块预处理过程中实现; + 2、在天数智芯BI-150显卡上,调用推理引擎tensorrt进行推理,一些onnx模型需要利用前面一步导出的onnx模型再进行插件算子的优化; + + 环境准备: + 1、sdk版本: 由天数智芯工程师提供 + 2、ixrt版本:由天数智芯工程师提供 +""" + + +""" + ***************************11个小模型的测试与测试报告生成的操作方法**************************** + 整个代码运行过程中,主要是从workloads目录下加载对应的模型的配置,主要有test_perf、test_accuracy、test_numeric三项测试内容,用户可以根据自己的需要选择开启与否; + 一般情况下采用字节默认的配置项即可;需要特别修改的配置下面会进行说明 + + 输出性能文档里面涉及的字段说明: + 1、QPS、AVG Latency、P99 Latency:这3个指标是走字节框架,采用天数智芯的推理引擎IxRT会计算H2D、D2H的时间,也就是数据在不同的设备(CPU、GPU)之间传输耗时; + 2、predict QPS、predict AVG Latency、predict P99 Latency:这部分指标把上面一步计算H2D、D2H的耗时剔除出去了,因此可以看做纯推理耗时,这个耗时可以与利用 + ixerexec命令跑出来的结果做一定的对比,但是不一定完全对齐,因为走整个框架代码肯定会导致一部分性能损失 + + 数据集、模型准备: + cd ByteMLPerf/byte_infer_perf/general_perf + + bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad + bash general_perf/prepare_model_and_dataset.sh resnet50-torch-fp32 open_imagenet + bash general_perf/prepare_model_and_dataset.sh widedeep-tf-fp32 open_criteo_kaggle + bash general_perf/prepare_model_and_dataset.sh albert-torch-fp32 + bash general_perf/prepare_model_and_dataset.sh roformer-tf-fp32 open_cail2019 + bash general_perf/prepare_model_and_dataset.sh videobert-onnx-fp32 open_cifar + bash general_perf/prepare_model_and_dataset.sh yolov5-onnx-fp32 + bash general_perf/prepare_model_and_dataset.sh conformer-encoder-onnx-fp32 + bash general_perf/prepare_model_and_dataset.sh roberta-torch-fp32 + bash general_perf/prepare_model_and_dataset.sh deberta-torch-fp32 + bash general_perf/prepare_model_and_dataset.sh swin-large-torch-fp32 + bash general_perf/prepare_model_and_dataset.sh gpt2-torch-fp32 + + 上面的模型与数据集下载完毕后会生成在:general_perf/general_perf,需要把该目录在的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面 + 如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可; + + + 测试开始: + + cd ByteMLPerf/byte_infer_perf + + 备注:由于sftp机器崩溃,文件全部丢失,因此已有的获取数据方式可能不存在了 + + 1、bert模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/bert-torch-fp32/ + + 2、albert模型: + 测试过程中如果从huggingface网址不能下载文件,可以按照下面的操作进行下载 + + 下载方式:sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn(如果链接不上用ip替换:10.160.20.60) 密码:123..com + get /upload/3-app/byteperf/madlag.tar + tar -zxvf madlag.tar + exit + + 接着修改代码:ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py + AutoTokenizer.from_pretrained("madlag/albert-base-v2-squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/madlag/albert-base-v2-squad") (注意绝对路径根据实际情况修改,需要在ByteMLPerf前面在加一个当前目录最上层的路径,下同) + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/albert-torch-fp32/ + + 3、debert模型: + 测试过程中如果从huggingface网址不能下载文件,可以按照下面的操作进行下载 + + 下载方式:sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn(如果链接不上用ip替换:10.160.20.60) 密码:123..com + get /upload/3-app/byteperf/Palak.tar + tar -zxvf Palak.tar + exit + + 接着修改代码:ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py + AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/Palak/microsoft_deberta-base_squad") + + 给定的pt模型转成onnx后输入只有2个,因此这里特殊处理了一下;加载处理好的onnx模型:deberta-sim-drop-clip-drop-invaild-cast.onnx + 将其放到:general_perf/model_zoo/popular/open_deberta/ 目录下; + + 下载方式:sftp -P 29889 user01@58.247.142.52 密码:5$gS%659 + cd yudefu/bytedance_perf ; get deberta-sim-drop-clip-drop-invaild-cast.onnx + exit + + 移动:mv 
deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/ + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/deberta-torch-fp32/ + + 4、roberta模型: + 测试过程中如果从huggingface网址不能下载文件,可以按照下面的操作进行下载 + + 下载方式:sftp -P 29880 vipzjtd@iftp.iluvatar.com.cn(如果链接不上用ip替换:10.160.20.60) 密码:123..com + get /upload/3-app/byteperf/csarron.tar + tar -zxvf csarron.tar + exit + + 接着修改代码:ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py + AutoTokenizer.from_pretrained("csarron/roberta-base-squad-v1") => AutoTokenizer.from_pretrained("/ByteMLPerf/byte_infer_perf/csarron/roberta-base-squad-v1") + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/roberta-torch-fp32/ + + 5、videobert模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/videobert-onnx-fp32 + + 6、widedeep模型: + 该模型经过了特殊的处理,需要采用处理好的onnx模型:widedeep_dynamicshape_new.onnx; + 将其放到:general_perf/model_zoo/regular/open_wide_deep_saved_model/ + + 下载方式:sftp -P 29889 user01@58.247.142.52 密码:5$gS%659 + cd yudefu/bytedance_perf ; get widedeep_dynamicshape_new.onnx + exit + + 移动:mv widedeep_dynamicshape_new.onnx general_perf/model_zoo/regular/open_wide_deep_saved_model/ + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/widedeep-tf-fp32 + + 7、swin-transformer模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/swin-large-torch-fp32 + + 8、resnet50模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/resnet50-torch-fp32 + + 9、yolov5模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/yolov5-onnx-fp32 + + 10、conformer模型: + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task conformer-encoder-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/conformer-encoder-onnx-fp32 + + 11、roformer模型: + 该模型经过了特殊的处理,需要采用处理好的onnx模型:roformer_frozen.onnx; + 将其放到:general_perf/model_zoo/popular/open_roformer/ + + 下载方式:sftp -P 29889 user01@58.247.142.52 密码:5$gS%659 + cd yudefu/bytedance_perf ; get roformer_frozen.onnx + exit + + 移动:mv roformer_frozen.onnx general_perf/model_zoo/popular/open_roformer/ + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/roformer-tf-fp32 + + 12、gpt2模型: + 在进行测试时,请把workloads下面的gpt2-torch-fp32.json里面的精度、数值对比测试改成false + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task gpt2-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/gpt2-torch-fp32 +""" + +""" + ***************************大模型操作流程******************** + 说明: + 此部分侵入了字节代码框架,因此需要重新重构,暂时不需要进行测试 + + 操作流程: + 1. 进入ByteMLPerf目录 + 2. 执行 + 1)python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chatglm2-torch-fp16-6b --hardware_type ILU, + 得到chatglm2-torch-fp16-6b的精度和性能数据 + + 2)python3 byte_infer_perf/llm_perf/core/perf_engine.py --task chinese-llama2-torch-fp16-13b --hardware_type ILU, + 得到 chinese-llama2-torch-fp16-13b的精度和性能数据 + + 3. 
在byte_infer_perf/llm_perf/reports/ILU目录下查看得到模型精度和性能数据的json文件 +""" + +""" + ***************************Stable Diffusion模型操作流程******************** + 环境准备:官方的onnx2torch有bug存在,所以需要安装天数智芯适配版本的onnx2torch,采用pytorch推理框架 + + 操作过程: + 1、cd ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/onnx2torch + 2、执行:python3 setup.py install + 3、cd - + + 数据集、模型准备: + cd ByteMLPerf/byte_infer_perf/general_perf + + bash general_perf/prepare_model_and_dataset.sh vae-encoder-onnx-fp32 + + 上面的模型与数据集下载完毕后会生成在:general_perf/general_perf,需要把该目录在的model_zoo下面的regular、popular、sota移到general_perf/model_zoo下面 + 如果还缺少什么模型、数据集可以在prepare_model_and_dataset.sh里面执行类似上面的操作即可; + + 测试开始: + + cd ByteMLPerf/byte_infer_perf + + 1、vae-decoder模型: + 注意事项:由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置 + "batch_sizes":[4,8], "test_numeric": false, + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-decoder-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/vae-decoder-onnx-fp32 + + 2、vae-encoder模型: + 注意事项:由于天数智芯的显卡基本上都是32G显存, 因此需要修改workloads下面的模型启动配置 + "batch_sizes":[4,8], "test_numeric": false, + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task vae-encoder-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/vae-encoder-onnx-fp32 + + 2、clip模型: + 注意事项:为了实现性能测试, 因此需要修改workloads下面的模型启动配置 + "test_numeric": false, + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task clip-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/clip-onnx-fp32 +""" + + +""" + ***************************大模型操作流程-VLLM框架******************** + 说明: + 此部分代码未侵入框架代码,由于vllm框架未实现精度测试,因此精度测试可以沿用GPU的backends;其次,vllm的tp定义目前与框架定义的tp含义不一样, + 因此chatglm2、llama2模型的workloads配置里面的tp=2暂时不考虑,待后续商定好解决方案在继续 + + 环境准备: + 需要提前下载天数智芯适配的vllm安装包到测试环境下,为了方便看输出日志,省掉不必要的信息,安装完毕后,请注释掉: + /usr/local/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py 内部函数async def add_request 下面的logger.info输出日志 + + 测试开始: + + cd ByteMLPerf/byte_infer_perf + + 1、chatglm2模型: + 执行:python3 llm_perf/launch.py --task chatglm2-torch-fp16-6b --hardware_type ILUVATAR + 生成的测试报告位置:llm_perf/reports/ILUVATAR/chatglm2-torch-fp16-6b + + 2、llama2模型: + 执行:python3 llm_perf/launch.py --task chinese-llama2-torch-fp16-13b --hardware_type ILUVATAR + 生成的测试报告位置:llm_perf/reports/ILUVATAR/chinese-llama2-torch-fp16-13b +""" + + +""" + **************************部分小模型的int8精度推理测试************************ + 说明: + 字节目前想验证部分小模型的int8精度推理的性能,因此需要基于ixrt(tensorrt)推理引擎进行适配支持 + 目前需要验证的小模型包括:resnet50、yolov5、widedeep、bert + + 注意如果在测试bert的int8推理时,报错,可能是sdk、ixrt版本问题导致;需要升级; + 生成的报告,并没有更改里面的精度标识,这里只是给出一个测试case,因此并没有将这部分代码加到代码中 + + 环境准备:不需要特别准备,之前如果测试过小模型的性能,相关的环境已经存在了; + + 测试开始: + + cd ByteMLPerf/byte_infer_perf + + 1、resnet50 模型: + 模型准备:在进行int8精度推理时,需要提供经过量化后的onnx模型,这里直接给出量化好的模型 + + 下载方式: + sftp -P 29889 user01@58.247.142.52 密码:5$gS%659(内网连接:sftp -P 29889 user01@10.160.20.61) + cd yudefu/bytedance_perf + get quantized_Resnet50.onnx + exit + mv quantized_Resnet50.onnx general_perf/model_zoo/regular/open_resnet50 + + 手动更改配置文件:general_perf/model_zoo/resnet50-torch-fp32.json 中的 model_precision 精度为 INT8 + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task resnet50-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/resnet50-torch-fp32 + + 2、widedeep 模型: + 模型准备:在进行int8精度推理时,需要提供经过量化后的onnx模型,这里直接给出量化好的模型 + + 下载方式: + sftp -P 29889 user01@58.247.142.52 密码:5$gS%659(内网连接:sftp -P 29889 user01@10.160.20.61) + cd yudefu/bytedance_perf + get quantized_widedeep_staticshape.onnx + exit + mv quantized_widedeep_staticshape.onnx 
general_perf/model_zoo/regular/open_wide_deep_saved_model/ + + 手动更改配置文件:general_perf/model_zoo/widedeep-tf-fp32.json 中的 model_precision 精度为 INT8 + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/widedeep-tf-fp32 + + 3、yolov5 模型: + 模型准备:在进行int8精度推理时,需要提供经过量化后的onnx模型,这里直接给出量化好的模型 + + 下载方式: + sftp -P 29889 user01@58.247.142.52 密码:5$gS%659(内网连接:sftp -P 29889 user01@10.160.20.61) + cd yudefu/bytedance_perf + get quantized_yolov5s.onnx + exit + mv quantized_yolov5s.onnx general_perf/model_zoo/popular/open_yolov5/ + + 手动更改配置文件:general_perf/model_zoo/yolov5-onnx-fp32.json 中的 model_precision 精度为 INT8 + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task yolov5-onnx-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/yolov5-onnx-fp32 + + 4、bert 模型: + 模型准备:在进行int8精度推理时,需要提供经过量化后的onnx模型,这里直接给出量化好的模型;该模型直接拿生成好的engine进行推理 + + 下载方式: + sftp -P 29889 user01@58.247.142.52 密码:5$gS%659(内网连接:sftp -P 29889 user01@10.160.20.61) + cd yudefu/bytedance_perf + get bert_zijie_int8_b196.engine + exit + mv bert_zijie_int8_b196.engine general_perf/model_zoo/regular/open_bert/ + + 手动更改配置文件:general_perf/model_zoo/bert-torch-fp32.json 中的 model_precision 精度为 INT8 + + 执行:python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task bert-torch-fp32 + 生成的测试报告位置:general_perf/reports/ILUVATAR/bert-torch-fp32 +""" \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/common.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/common.py new file mode 100644 index 0000000000000000000000000000000000000000..096a83e7fe205bf424c7d306a9784d6f2752aa1a --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/common.py @@ -0,0 +1,320 @@ +import os +import random +import torch +import ctypes +import numpy as np +from os.path import join, dirname, exists + +import pycuda.driver as cuda +from cuda import cuda,cudart +import threading + +import importlib + +tensorrt = None +Dims = None + +tvm = None + +def setup_seed(seed): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + torch.backends.cudnn.deterministic = True + + +def load_ixrt_plugin(logger=None, namespace="", dynamic_path="", model="", precision=""): + global tensorrt + global Dims + + if tensorrt is not None: + return + + if precision == 'FP16': + if model == 'resnet50' or model == 'bert' or model == 'albert' or model == 'deberta' or model == 'yolov5': + tensorrt = importlib.import_module("tensorrt") + Dims = getattr(tensorrt, "Dims") + else: + tensorrt = importlib.import_module("tensorrt") + Dims = getattr(tensorrt, "Dims") + + if precision == 'INT8': + tensorrt = importlib.import_module("tensorrt") + Dims = getattr(tensorrt, "Dims") + + if not dynamic_path: + dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") + + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + tensorrt.init_libnvinfer_plugins(tensorrt.Logger(tensorrt.Logger.INFO), namespace) + print(f"Loaded plugin from {dynamic_path}") + + +def build_engine(model_name, onnx_model_path, engine_path, MaxBatchSize, BuildFlag): + IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + builder = tensorrt.Builder(IXRT_LOGGER) + EXPLICIT_BATCH = 1 << 
(int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + build_config = builder.create_builder_config() + + profile = builder.create_optimization_profile() + + if model_name == 'resnet50': + profile.set_shape( + "input", Dims([1, 3, 224, 224]), Dims([32, 3, 224, 224]), Dims([MaxBatchSize, 3, 224, 224])) + + elif model_name == 'videobert': + profile.set_shape( + "image", Dims([1, 3, 224, 224]), Dims([32, 3, 224, 224]), Dims([MaxBatchSize, 3, 224, 224])) + profile.set_shape( + "text", Dims([100, 77]), Dims([100, 77]), Dims([100, 77])) + + elif model_name == 'yolov5': + profile.set_shape( + "images", Dims([1, 3, 640, 640]), Dims([32, 3, 640, 640]), Dims([MaxBatchSize, 3, 640, 640])) + + elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta': + profile.set_shape( + "input_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384])) + profile.set_shape( + "attention_mask.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384])) + profile.set_shape( + "token_type_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384])) + + elif model_name == 'deberta': + profile.set_shape( + "input_ids.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384])) + profile.set_shape( + "attention_mask.1", Dims([1, 384]), Dims([16, 384]), Dims([MaxBatchSize, 384])) + + elif model_name == 'widedeep': + profile.set_shape( + "new_numeric_placeholder:0", Dims([1, 13]), Dims([16, 13]), Dims([MaxBatchSize, 13])) + profile.set_shape( + "new_categorical_placeholder:0", Dims([1 * 26, 2]), Dims([16 * 26, 2]), Dims([MaxBatchSize * 26, 2])) + profile.set_shape( + "import/head/predictions/zeros_like:0", Dims([1, 1]), Dims([16, 1]), Dims([MaxBatchSize, 1])) + + elif model_name == 'conformer': + profile.set_shape( + "src", Dims([1, 3, 64, 512]), Dims([16, 3, 64, 512]), Dims([MaxBatchSize, 3, 64, 512])) + profile.set_shape( + "src_pad_mask", Dims([1, 128]), Dims([16, 128]), Dims([MaxBatchSize, 128])) + + elif model_name == 'roformer': + profile.set_shape( + "input_segment0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024])) + profile.set_shape( + "input_token0", Dims([1, 1024]), Dims([16, 1024]), Dims([MaxBatchSize, 1024])) + + elif model_name == 'swin': + profile.set_shape( + "pixel_values.1", Dims([1, 3, 384, 384]), Dims([32, 3, 384, 384]), Dims([MaxBatchSize, 3, 384, 384])) + + else: + pass + + build_config.add_optimization_profile(profile) + + parser = tensorrt.OnnxParser(network, IXRT_LOGGER) + parser.parse_from_file(onnx_model_path) + + if BuildFlag == 'FP16': + build_config.set_flag(tensorrt.BuilderFlag.FP16) + + if BuildFlag == 'INT8': + build_config.set_flag(tensorrt.BuilderFlag.INT8) + + # set dynamic shape + num_inputs = network.num_inputs + + for i in range(num_inputs): + if model_name == 'resnet50': + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, 3, 224, 224]) + + elif model_name == 'videobert': + input_tensor = network.get_input(i) + if i == 0: + input_tensor.shape = Dims([-1, 3, 224, 224]) + else: + input_tensor.shape = Dims([100, 77]) + + elif model_name == 'yolov5': + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, 3, 640, 640]) + network.get_input(i).dtype = tensorrt.float16 + + elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta': + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, 384]) + + elif model_name == 'widedeep': + input_tensor = network.get_input(i) + if i == 0: + 
input_tensor.shape = Dims([-26, 2]) + elif i == 1: + input_tensor.shape = Dims([-1, 13]) + else: + input_tensor.shape = Dims([-1, 1]) + + elif model_name == 'conformer': + input_tensor = network.get_input(i) + if i == 0: + input_tensor.shape = Dims([-1, 3, 64, 512]) + else: + input_tensor.shape = Dims([-1, 128]) + + elif model_name == 'roformer': + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, 1024]) + + elif model_name == 'swin': + input_tensor = network.get_input(i) + input_tensor.shape = Dims([-1, 3, 384, 384]) + + else: + pass + + plan = builder.build_serialized_network(network, build_config) + + with open(engine_path, "wb") as f: + f.write(plan) + + print("***Build dynamic shape engine success!***") + + +def build_igie_engine(model_name, model_path, input_dict, model_framework, precision, engine_path): + global tvm + + if tvm is not None: + return + + if not os.path.exists(engine_path): + tvm = importlib.import_module("tvm") + from general_perf.backends.ILUVATAR.utils.import_model import import_model_to_igie + + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + mod, params = import_model_to_igie(model_path, input_dict, model_framework, backend='igie') + lib = tvm.relay.build(mod, target=target, params=params, precision=precision, verbose=False) + lib.export_library(engine_path) + else: + pass + + +def init_by_tensorrt(engine_path): + datatype = tensorrt.DataType.FLOAT + host_mem = tensorrt.IHostMemory + logger = tensorrt.Logger(tensorrt.Logger.ERROR) + + with open(engine_path, "rb") as f, tensorrt.Runtime(logger) as runtime: + runtime = tensorrt.Runtime(logger) + assert runtime + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine + context = engine.create_execution_context() + assert context + + return engine, context + + +def setup_io_bindings(engine, context): + # Setup I/O bindings + inputs = [] + outputs = [] + allocations = [] + + for i in range(engine.num_bindings): + is_input = False + if engine.binding_is_input(i): + is_input = True + + name = engine.get_binding_name(i) + dtype = engine.get_binding_dtype(i) + shape = context.get_binding_shape(i) + + if is_input: + batch_size = shape[0] + size = np.dtype(tensorrt.nptype(dtype)).itemsize + + for s in shape: + size *= s + + # allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess + + binding = { + "index": i, + "name": name, + "dtype": np.dtype(tensorrt.nptype(dtype)), + "shape": list(shape), + "allocation": allocation, + "nbytes": size + } + + allocations.append(allocation) + + if engine.binding_is_input(i): + inputs.append(binding) + else: + outputs.append(binding) + + return inputs, outputs, allocations + + +# multi cores inference codes +class Task: + def __init__(self, bs, dataset, device_id, load_fun, benchmark_fun, performance_reports, lock, framework) -> None: + self.dataset = dataset + self.benchmark_fun = benchmark_fun + self.device_id = device_id + self.performance_reports = performance_reports + checkCudaErrors(cudart.cudaSetDevice(device_id)) + if framework != 'gpt2': + load_fun(bs) + + self.lock = lock + self.module = None + + + def run(self): + checkCudaErrors(cudart.cudaSetDevice(self.device_id)) + batch_reports = self.benchmark_fun(self.dataset) + self.performance_reports.append(batch_reports) + + +class TaskThread(threading.Thread): + def __init__(self, func, args): + threading.Thread.__init__(self) + self.func = func + self.args = args + + def run(self): + 
self.func(*self.args) + + +def _cudaGetErrorEnum(error): + if isinstance(error, cuda.CUresult): + err, name = cuda.cuGetErrorName(error) + return name if err == cuda.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, cudart.cudaError_t): + return cudart.cudaGetErrorName(error)[1] + else: + raise RuntimeError('Unknown error type: {}'.format(error)) + + +def checkCudaErrors(result): + if result[0].value: + raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py new file mode 100644 index 0000000000000000000000000000000000000000..7ebb251a405c60e083e200c5fff31db413df9833 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py @@ -0,0 +1,262 @@ +# Copyright 2023 Graphcore Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import logging +import subprocess + +from general_perf.backends.ILUVATAR.common import load_ixrt_plugin + +from general_perf.backends.ILUVATAR.common import build_engine +from general_perf.backends.ILUVATAR.optimizer.passes import * +from general_perf.tools.torch_to_onnx import torch_to_onnx +from general_perf.tools.saved_to_onnx import savedmodel_to_onnx +from general_perf.model_zoo import * +from general_perf.backends import compile_backend + +log = logging.getLogger("CompileBackendILUVATAR") + + +class CompileBackendILUVATAR(compile_backend.CompileBackend): + def __init__(self): + super(CompileBackendILUVATAR, self).__init__() + self.hardware_type = "ILUVATAR" + self.need_reload = False + self.model_runtimes = [] + self.model_config = None + + def version(self) -> str: + """Return compile backend version details.""" + return tensorrt.__version__ + + def compile(self, configs, dataloader=None): + model = configs['model_info']['model'] + model_name = configs['model_info']['model'].split("-")[0] + model_path = configs['model_info']['model_path'] + MaxBatchSize = configs['model_info']['max_batch_size'] + + precision = configs['model_info']['model_precision'].replace('FP32', 'FP16') + + if precision == 'FP16': + if model_name == 'resnet50' or model_name == 'bert' or model_name == 'albert' or model == 'deberta' or model_name == 'yolov5': + import tensorrt + else: + import tensorrt + + if precision == 'INT8': + import tensorrt + + load_ixrt_plugin(model=model_name, precision=precision) + + if model_name == 'gpt2': + from general_perf.backends.ILUVATAR.common import build_igie_engine + + # call the ONNX model and the compiled engine file + if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5': + onnx_model_path = model_path.split(".")[0] + "_end.onnx" + engine_path = model_path.split(".")[0] + "_end.engine" + + elif model_name == 
'widedeep' or model_name == 'roformer': + onnx_model_path = model_path + "/" + model + "_end.onnx" + engine_path = model_path + "/" + model + "_end.engine" + + elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \ + or model_name == 'resnet50': + onnx_model_path = os.path.dirname(model_path) + "/" + model + "_end.onnx" + engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" + + else: + onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx" + engine_path = os.path.dirname(model_path) + "/" + model + ".engine" + + # model preprocessing + self.get_onnx(configs) + + # build engine + if configs['model_info']['model_precision'].replace('FP32', 'FP16') == 'FP16': + precision_flag = "FP16" + if model_name == 'widedeep': + onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape.onnx" + engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16') + + elif model_name == 'deberta': + onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end.onnx" + engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16') + + elif model_name == 'roformer': + onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end.onnx" + engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16') + + elif model_name == 'gpt2': + for bs in configs['workload']['batch_sizes']: + onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx" + engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(bs) + ".so" + + for key, val in configs['model_info']['input_shape'].items(): + input_dict = {} + val = val = [val[0] * bs] + val[1:] + input_dict[key] = val + + build_igie_engine(model_name=model_name, model_path=onnx_model_path, input_dict=input_dict, model_framework='onnx', precision='fp16', engine_path=engine_path) + + elif model == 'vae-decoder-onnx-fp32' or model == 'vae-encoder-onnx-fp32' or model == 'clip-onnx-fp32': + pass + + else: + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='FP16') + + if configs['model_info']['model_precision'] == 'INT8': + precision_flag = "INT8" + if model_name == 'widedeep': + onnx_model_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape.onnx" + engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8') + + if model_name == 'resnet50': + onnx_model_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50.onnx" + engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, 
engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8') + + if model_name == 'yolov5': + onnx_model_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s.onnx" + engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" + build_engine(model_name=model_name, onnx_model_path=onnx_model_path, engine_path=engine_path, MaxBatchSize=MaxBatchSize, BuildFlag='INT8') + + if model_name == 'bert': + print(f"\n==========****bert模型的int8精度推理采用直接加载engine文件, 因此不需要build engine! ****===========") + + result = { + "model": + configs['model_info']['model'], + "model_name": + configs['model_info']['model'].split("-")[0], + "model_path": + configs['model_info']['model_path'], + "framework": + configs['model_info']['framework'], + "compile_precision": + precision_flag, + "input_type": + configs['model_info']['input_type'].split(","), + "max_batch_size": + configs['model_info']['max_batch_size'], + "compile_status": + "success", + "sg_percent": 100, + "segments": [ + { + "sg_idx": 0, + "is_fallback": False, + "input_tensor_map": + configs['model_info']['input_shape'], + "output_tensor_map": + configs['model_info']['outputs'], + "compiled_model": [ + { + "compiled_bs": 1, + "compiled_obj": configs['model_info']['model_path'], + }, + ], + }, + ], + } + + self.configs = result + self.workload = configs['workload'] + self.model_info = configs['model_info'] + + for key, value in result.items(): + print('{key}: {value}'.format(key=key, value=value)) + + return result + + + def get_interact_profile(self, configs): + """ + Collect information for core engine to let user interactively fill in configurations. + """ + return [] + + def get_best_batch_size(self): + """Get Best Batch Size for the model. + Usually take the max batch size can be loaded to IPU as the best batch size to + get highest throughput. 
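+        This backend does not pick a value and simply returns None.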
+ """ + return None + + def get_onnx(self, configs): + model = configs['model_info']['model'] + model_name = configs['model_info']['model'].split("-")[0] + model_path = configs['model_info']['model_path'] + + # model save location + if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5': + onnx_model_path = model_path + + elif model_name == 'widedeep' or model_name == 'roformer': + onnx_model_path = model_path + "/" + model + ".onnx" + + else: + onnx_model_path = os.path.dirname(model_path) + "/" + model + ".onnx" + + framework = configs['model_info']['framework'] + + if framework == 'Pytorch': + torch_to_onnx(model_path=model_path, output_path=onnx_model_path) + print("***Convert pt model to onnx model success!***") + + if framework == 'Tensorflow': + savedmodel_to_onnx(model_path=model_path, output_path=onnx_model_path) + print("***Convert pb model to onnx model success!***") + + # Convert ONNX model to plugin operator model: Support fusion of dynamic and static graphs + if model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or \ + model_name == 'videobert' or model_name == 'resnet50' or model_name == 'widedeep': + + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path}' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + elif model_name == 'deberta': + onnx_model_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast.onnx" + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path}' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + elif model_name == 'swin': + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type swint' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + elif model_name == 'yolov5': + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type yolo' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + elif model_name == 'roformer': + onnx_model_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen.onnx" + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type roformer --input_shapes input_segment0:bsx1024,input_token0:bsx1024' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + elif model_name == 'conformer': + cmd = f'python3 general_perf/backends/ILUVATAR/optimizer/optimizer.py --onnx {onnx_model_path} --model_type conformer --hidden_size 512 --num_heads 8' + subprocess.call(cmd, shell=True) + print("***Convert onnx model to plugin operator model success!***") + + else: + pass diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..65175643c0e50d8445ef65deae088de4600244f0
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/README.md
@@ -0,0 +1,44 @@
+## CI Test tool for IxRT
+
+### 1. Install dltest tool
+
+    python setup.py develop
+
+### 2. Usage
+
+#### 2.1 Fetch log
+
+Command:
+
+```shell
+ixdltest-fetch args_or_pipe ${log_path}
+```
+
+Arguments:
+
+- `-p` or `--patterns`: the patterns to fetch from the log;
+- `-pn` or `--pattern_names`: the names of the patterns;
+- `--use_re`: whether to treat the patterns as regular expressions;
+- `-d` or `--nearest_distance`: default=10, the maximum distance between a pattern and the value it matches;
+- `--start_flag`: the flag that starts recording the log;
+- `--end_flag`: the flag that stops recording the log;
+- `--split_pattern`: the pattern used to match a line; when a line matches, `split_sep` is used to split it;
+- `--split_sep`: the separator used to split the line;
+- `--split_idx`: the index of the field after splitting;
+- `--saved`: save the result to this path;
+- `log`: the log path.
+
+Examples
+Analyse a log file
+```
+$ ixdltest-fetch run.log -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
+{'results': [{'Throughput': [188.5461778786721]}]}
+- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
+```
+
+Analyse output piped from the command line
+```
+$ cat run.log | ixdltest-fetch -p "Throughput" -t_bi150 Throughput:100 -t_mr100 Throughput:100
+{'results': [{'Throughput': [188.5461778786721]}]}
+- Check Throughput on BI150 passed (result vs target): 188.5461778786721>=100.0
+```
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5458f31666f11de72d52a4e834b8a87be9a992d0
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/__init__.py
@@ -0,0 +1 @@
+from .utils.infer_args import show_infer_arguments
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca6e197c00f65e4f6cb563b4e2993ed1da360379
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/assert_cli.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
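+
+# AssertCLI parses metric values either from an existing log (--log) or from the
+# captured output of --run_script, then evaluates an assertion expression for each
+# parsed value `a` against the reference value `b` given by -b/--assertion_second_value.
+# Hypothetical invocation (the assert entry point is currently commented out in
+# entry_points.py, so this is illustrative only):
+#   ixdltest-assert --log train.log -p "Acc@1" --expr ge -b 75.0 --print_result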
+ +import os +from typing import List, Iterable, Optional + +from dltest.cli.log_parser_cli import LogParserCLI +from dltest.log_parser import LogParser +from dltest.model_compare_config import get_compare_config_with_full_path +from dltest.utils.misc import get_full_path +from dltest.utils.subprocess_tools import get_output +from dltest.model_compare_config import ComparatorConfig + + +FRAMEWORKS = list(ComparatorConfig.get_frameworks()) + +REMAINDER = '...' + +assertion_expr_factory = dict( + eq = "a == b", + ne = "a != b", + ge = "a >= b", + le = "a <= b", + gt = "a > b", + lt = "a < b", +) + + +class AssertCLI(LogParserCLI): + + def command_name(self): + return "assert" + + def predefine_args(self): + super(AssertCLI, self).predefine_args() + self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None, + help='It is used in assertion expression.') + self.parser.add_argument('--print_result', action="store_true", default=False, + help='Whether print result') + self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], + help='The method of capture output') + # FIXME: Using store_action to replase it + self.parser.add_argument('--only_last', type=int, default=0, + help='Whether use the last result to compare') + self.parser.add_argument('--expr', type=str, default="ge", + help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" + + ", or a executable code, such as `a > b`, `a > 1`, ...") + self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False, + help='Whether use predefined args of parser.') + self.parser.add_argument('--log', type=str, default=None, help="Log path") + self.parser.add_argument("--run_script", default=[], nargs=REMAINDER) + + def parse_args(self, *args, **kwargs): + args = super(AssertCLI, self).parse_args() + args.only_last = args.only_last > 0 + if len(args.run_script) == 0 and args.log is None: + raise ValueError("The one of `--run_script` or `--log` must be given.") + + if args.assertion_second_value is None: + if args.expr is None: + raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.") + + if args.expr in assertion_expr_factory: + raise ValueError( + "The comparison operators depend on the argument `assertion_second_value`." 
+ ) + + return args + + def create_parser(self, args): + if args.use_predefined_parser_rules: + script_path = self._get_script_path(args.run_script) + config = get_compare_config_with_full_path(script_path, to_dict=False) + + return LogParser( + patterns=config.patterns, pattern_names=config.pattern_names, + use_re=config.use_re, nearest_distance=config.nearest_distance, + start_line_pattern_flag=config.start_line_pattern_flag, + end_line_pattern_flag=config.end_line_pattern_flag, + split_pattern=config.split_pattern, + split_sep=config.split_sep, + split_idx=config.split_idx + ) + + return LogParser( + patterns=args.patterns, pattern_names=args.pattern_names, + use_re=args.use_re, nearest_distance=args.nearest_distance, + start_line_pattern_flag=args.start_flag, + end_line_pattern_flag=args.end_flag, + split_pattern=args.split_pattern, + split_sep=args.split_sep, + split_idx=args.split_idx + ) + + def run(self): + args = self.parse_args() + parser = self.create_parser(args) + + if args.print_result: + print(args) + + output = self.get_log(args) + parsed_logs = self.parser_log(parser, output, args) + self.check_logs(parsed_logs, args) + + def get_log(self, args): + if len(args.run_script) == 0: + try: + with open(args.log) as f: + return f.readlines() + except: + print(f"ERROR: Read log fail in {args.log}") + exit(1) + else: + return get_output(args.run_script, capture_output_method=args.capture_output) + + def parser_log(self, parser, output, args) -> List[float]: + results = parser.parse(output) + if args.only_last: + results = results[-1:] + + if len(results) == 0: + raise ValueError("The parsed results is empty, please check patterns.") + if isinstance(results[0], dict): + if len(results[0]) == 0: + raise ValueError("The parsed results is empty, please check patterns.") + key = list(results[0].keys())[0] + results = [result[key] for result in results] + + if isinstance(results[0], Iterable): + results = [result[0] for result in results] + + return results + + def check_logs(self, parsed_logs, args): + if args.print_result: + print("Parsed result:", parsed_logs) + + assertion_expr = assertion_expr_factory.get(args.expr, args.expr) + + assert_results = [] + b = args.assertion_second_value + for a in parsed_logs: + assert_results.append(eval(assertion_expr)) + + if args.print_result: + print("The result of assertion expression:", assert_results) + + if any(assert_results): + print("SUCCESS") + exit(0) + print("FAIL") + exit(1) + + def _get_script_path(self, run_script: List[str]): + # Find shell script by current run_script + def _find_real_shell_script(cmd: List[str]): + for i, field in enumerate(cmd): + if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS: + return field + + real_shell_script = _find_real_shell_script(run_script) + + # Find shell script by parent process + if real_shell_script is None: + ppid = os.getppid() + import psutil + pproc = psutil.Process(ppid) + pproc_cmd = pproc.cmdline() + real_shell_script = _find_real_shell_script(pproc_cmd) + + if real_shell_script is not None: + real_shell_script = self._get_script_abs_path(real_shell_script) + return real_shell_script + + raise RuntimeError("The script is not named correctly, " + \ + "please use a script name ending with the framework, " + \ + f"got `{' '.join(run_script)}`, " + \ + "e.g. 
train_resnet50_torch.sh") + + def _get_framework(self, shell_script: str) -> Optional[str]: + try: + return shell_script.split('.')[-2].split('_')[-1] + except: + return None + + def _get_script_abs_path(self, run_script): + real_run_script = os.path.realpath(run_script) + if os.path.exists(real_run_script): + return real_run_script + + if "MODEL_DIR" in os.environ: + return os.path.join(os.environ["MODEL_DIR"], run_script) + + if "OLDPWD" in os.environ: + real_run_script = os.path.join(os.environ["OLDPWD"], run_script) + if os.path.exists(real_run_script): + return real_run_script + + raise FileNotFoundError("Not found running script path, " + \ + "please set environment variable `MODEL_DIR`, " + \ + "e.g /path/to/deeplearningsamples/executables/resnet.") + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..b40f3a72fb949c18104963fb598c58076c65b479 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/check_cli.py @@ -0,0 +1,56 @@ +import os + +from .assert_cli import AssertCLI +from ..utils.subprocess_tools import execute_shell + +RUN_MODE_KEY = "RUN_MODE" +RUN_MODE_STRICT = "strict" + + +class CheckCli(AssertCLI): + + def __init__(self, *args, **kwargs): + super(CheckCli, self).__init__(*args, **kwargs) + self.args = None + + def command_name(self): + return "check" + + def predefine_args(self): + self.parser.add_argument("--check_mode", type=str, default="no", + choices=["all", "strict", "nonstrict", "no"], + help="which running mode needs to be checked") + self.parser.add_argument("--nonstrict_mode_args", type=str, default="", + help="the arguments are used with nonstric testing") + super(CheckCli, self).predefine_args() + + def parse_args(self, *args, **kwargs): + if self.args is None: + args = super(CheckCli, self).parse_args(*args, **kwargs) + args.use_predefined_parser_rules = True + args.nonstrict_mode_args = args.nonstrict_mode_args.split(" ") + + if not self.is_strict_testing(): + args.run_script.extend(args.nonstrict_mode_args) + + if args.check_mode == "all": + args.check_mode = self.current_running_mode() + + self.args = args + return self.args + + def run(self): + args = self.parse_args() + if args.check_mode == self.current_running_mode(): + return super(CheckCli, self).run() + else: + res = execute_shell(args.run_script) + exit(res.returncode) + + def current_running_mode(self): + return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT) + + def is_strict_testing(self): + return self.current_running_mode() == RUN_MODE_STRICT + + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py new file mode 100644 index 0000000000000000000000000000000000000000..3451623d5c811ccdc9ee82714d5069ee04512742 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/entry_points.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. 
+# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from dltest.cli.assert_cli import AssertCLI +from dltest.cli.log_comparator_cli import LogComparatorCLI +from dltest.cli.model_validator_cli import ModelValidatorCLI +from dltest.cli.fetch_log_cli import FetchLog +from dltest.cli.check_cli import CheckCli + + +#log_comparator_cli = LogComparatorCLI() +#model_validator_cli = ModelValidatorCLI() +fetch_log_cli = FetchLog() +#assert_cli = AssertCLI() +#check_cli = CheckCli() + + +def make_execute_path(): + preffix = "dltest.cli.entry_points" + clis = [] + for cli_var in globals(): + if cli_var.endswith('_cli'): + cmd_name = globals()[cli_var].command_name() + clis.append(f"ixdltest-{cmd_name}={preffix}:{cli_var}") + + return clis + + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..0059cecf77b8a71c8b41de947af84a8210d55950 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/fetch_log_cli.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
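+
+# FetchLog backs the `ixdltest-fetch` entry point: it extracts metrics from a log
+# file (or from stdin when no path is given), optionally stores them as JSON via
+# --saved/--saved_entry, and compares them against per-card targets such as
+# `-t_bi150 Throughput:100`, exiting with a non-zero status when a value misses
+# its target.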
+
+
+import json
+import sys
+from typing import Mapping
+from os.path import basename, join, exists, expanduser, dirname
+
+from dltest.log_parser import LogParser
+from dltest.cli.log_parser_cli import LogParserCLI
+from dltest.utils.iluvatar import get_iluvatar_card_type, IluvatarGPU
+
+
+
+def parse_target(target):
+    result = {}
+    targets = target.split(",")
+    for i in targets:
+        item = i.split(":")
+        assert len(item) == 2
+        key, value = item
+        result[key] = float(value)
+    return result
+
+
+def load_json(file):
+    file_path = expanduser(file)
+    # Check whether the file exists
+    if exists(file_path):
+        # Load the JSON file
+        with open(file_path, 'r') as file:
+            data = json.load(file)
+    else:
+        # Fall back to an empty dict
+        data = {}
+
+    return data
+
+def process_results(results):
+    result = dict()
+    for i in results["results"]:
+        for k, v in i.items():
+            result[k] = v[0]
+    return result
+
+class FetchLog(LogParserCLI):
+
+    def command_name(self):
+        return "fetch"
+
+    def predefine_args(self):
+        super(FetchLog, self).predefine_args()
+        self.parser.add_argument('log', nargs='?', type=str, help="Log path")
+        self.parser.add_argument('--saved', type=str, default=None, help='Save to path')
+        self.parser.add_argument('--saved_entry', type=str, default=None, help='Save to path')
+        self.parser.add_argument('-t_bi150', '--target_bi150', type=str, default=-1.)
+        self.parser.add_argument('-t_mr100', '--target_mr100', type=str, default=-1.)
+        self.parser.add_argument('-t_mr50', '--target_mr50', type=str, default=-1.)
+
+    def run(self):
+        args = self.parse_args()
+        parser = LogParser(
+            patterns=args.patterns, pattern_names=args.pattern_names,
+            use_re=args.use_re, nearest_distance=args.nearest_distance,
+            start_line_pattern_flag=args.start_flag,
+            end_line_pattern_flag=args.end_flag,
+            split_pattern=args.split_pattern,
+            split_sep=args.split_sep,
+            split_idx=args.split_idx
+        )
+
+        results = parser.parse(args.log)
+        if not isinstance(results, Mapping):
+            results = dict(results=results)
+        results = process_results(results)
+        print(results)
+
+        if args.saved is not None:
+            saved = load_json(args.saved)
+            if not args.saved_entry:
+                raise Exception("You need to use --saved_entry to specify the entry name of the result")
+
+            saved[args.saved_entry] = results
+            with open(args.saved, 'w') as f:
+                json.dump(saved, f, indent=4)
+        self.compare_results(args, results)
+
+
+    def compare_results(self, args, results):
+        card = get_iluvatar_card_type()
+        if card == IluvatarGPU.UNKNOWN:
+            print("Unable to determine which card is used; is ixsmi available in this environment?")
+            return
+        user_target = getattr(args, 'target_' + card.name.lower(), "")
+        # Skip the check when no target was given for this card type.
+        if not isinstance(user_target, str) or not user_target:
+            return
+        user_target = parse_target(user_target)
+
+        is_expected = True
+        for key, target in user_target.items():
+            if key not in results:
+                continue
+            if results[key] < target:
+                is_expected = False
+                print(f"- Check {key} on {card.name} failed (result vs target): {results[key]}<{target}")
+            else:
+                print(f"- Check {key} on {card.name} passed (result vs target): {results[key]}>={target}")
+        if not is_expected:
+            sys.exit(1)
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5863b56d999f4a2b2fb27c6c8aae85ab7898341
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_comparator_cli.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import json +from pprint import pprint + +from dltest.cli.log_parser_cli import LogParserCLI +from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS + + +class LogComparatorCLI(LogParserCLI): + + def command_name(self): + return "compare" + + def predefine_args(self): + super(LogComparatorCLI, self).predefine_args() + self.parser.add_argument('--log1', type=str, help="First log") + self.parser.add_argument('--log2', type=str, help="Second log") + self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold") + self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare') + self.parser.add_argument('--saved', type=str, default=None, help='Save to path') + self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') + self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2') + + def parse_args(self, *args, **kwargs): + args = super(LogComparatorCLI, self).parse_args(*args, **kwargs) + args.only_last = args.only_last >= 1 + + return args + + def run(self): + args = self.parse_args() + satisfied, results = compare_logs_with_paths( + log1=args.log1, log2=args.log2, + threshold=args.threshold, + patterns=args.patterns, pattern_names=args.pattern_names, + use_re=args.use_re, nearest_distance=args.nearest_distance, + start_line_pattern_flag=args.start_flag, + end_line_pattern_flag=args.end_flag, + only_last=args.only_last, + split_pattern=args.split_pattern, + split_sep=args.split_sep, + split_idx=args.split_idx, + allow_greater_than=True + ) + + if args.print_result: + pprint(results) + + if satisfied: + print("SUCCESS") + else: + print("FAIL") + + if args.saved is not None: + with open(args.saved, 'w') as f: + json.dump(results, f) + + + + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..7263543ef3c8addee1ebe20363ae18a34571e35d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/log_parser_cli.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import json +from typing import Mapping + +from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS +from dltest.utils.base_cli import BaseCLI + + +class LogParserCLI(BaseCLI): + + def predefine_args(self): + self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns') + self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern') + self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression') + self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern') + self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log') + self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log') + self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line') + self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line') + self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line') + + def parse_args(self, *args, **kwargs): + args = super(LogParserCLI, self).parse_args(*args, **kwargs) + + return args + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4f68c587282553c48055256ca0a37f6b2a4eab --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/cli/model_validator_cli.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import json +import os +import os.path as ospath +from pprint import pprint +from typing import List, Union + +from dltest.utils.base_cli import BaseCLI +from dltest.utils.get_env import get_gpu_type +from dltest.utils.misc import get_full_path +from dltest.model_compare_config import get_compare_config_with_full_path +from dltest.log_comparator import compare_logs_with_paths +from dltest.utils.subprocess_tools import get_output + + +REMAINDER = '...' 
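+
+# ModelValidatorCLI runs a training/inference script, captures its output, parses
+# the metrics with the per-model rules from model_compare_config.py, and compares
+# them against a reference log recorded on the other platform (NV vs BI); it is
+# wired up as an entry point only when model_validator_cli is enabled in
+# entry_points.py.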
+ + +class ModelValidatorCLI(BaseCLI): + + def command_name(self): + return "validate" + + def predefine_args(self): + super(ModelValidatorCLI, self).predefine_args() + self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log") + self.parser.add_argument('--saved', type=str, default=None, help='Save to path') + self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared") + self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') + self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output') + self.parser.add_argument("run_script", nargs=REMAINDER) + + def parse_args(self, *args, **kwargs): + args = super(ModelValidatorCLI, self).parse_args() + if len(args.run_script) == 0: + print("ERROR: Invalid run_script") + exit(1) + + return args + + def run(self): + args = self.parse_args() + output = self._run_script(args.run_script, capture_output_method=args.capture_output) + self.compare_logs( + output, args.compare_log, args.run_script, + args.saved, args.with_exit_code, + args.print_result + ) + + def compare_logs(self, output: List, compare_log: str, + run_script: List[str], saved: str=None, + with_exit_code: int=1, print_result=False): + script_path = self._get_script_path(run_script) + script_path = get_full_path(script_path) + compare_args = get_compare_config_with_full_path(script_path) + + if compare_log is None: + epoch = self._get_epoch(run_script) + script_name = ospath.basename(script_path) + dist_tag = self._get_dist_tag(script_name) + compare_log = self._find_comparable_log(script_path, epoch, dist_tag) + + if not ospath.exists(compare_log): + print(f"ERROR: {compare_log} not exist. 
Or please use argument `l` to locate log.") + exit(1) + + compare_args['log1'] = output + compare_args['log2'] = compare_log + + satisfied, results = compare_logs_with_paths(**compare_args) + + if print_result: + pprint(results) + + if satisfied: + print("SUCCESS") + else: + print("FAIL") + + if saved is not None: + with open(saved, 'w') as f: + json.dump(results, f) + + if with_exit_code: + if satisfied: + exit(0) + else: + exit(1) + + def _run_script(self, command: List, capture_output_method: str='tempfile'): + return get_output(command, capture_output_method=capture_output_method) + + def _get_script_path(self, run_script: List[str]): + for i, field in enumerate(run_script): + if field.endswith('.py') or field.endswith('.sh'): + return field + + raise RuntimeError("Not found the name of script, " + + "only support python or `sh` script, but got {}.".format(run_script)) + + def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str): + gpu_type = get_gpu_type().lower() + + # Get the platform of trained log + if gpu_type == "nv": + gpu_type = 'bi' + else: + gpu_type = 'nv' + + script_path = get_full_path(script_path) + project_dir = self._get_project_dir(script_path) + script_name = ospath.basename(script_path) + + log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log" + + return log_path + + + def _get_epoch(self, run_script: List[str]): + for i, field in enumerate(run_script): + if "--epoch" in field: + if "=" in field: + return field.split("=")[1] + else: + return run_script[i + 1] + + return 'default' + + def _get_dist_tag(self, script_name: str): + try: + import torch + num_gpus = torch.cuda.device_count() + except: + num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all") + + if '_dist_' in script_name or '_multigpu_' in script_name: + return f".{num_gpus}card" + return "" + + def _get_project_dir(self, abs_path): + abs_path = ospath.abspath(abs_path) + script_dir = ospath.dirname(abs_path) + executables_dir = ospath.dirname(script_dir) + return ospath.dirname(executables_dir) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py new file mode 100644 index 0000000000000000000000000000000000000000..8a633b63b07ef397969c071aee94e023de4aea37 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_comparator.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
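+
+# _compute_errors below reports value1 - value2 per metric.  A metric passes when
+# |value1 - value2| <= threshold, or when allow_greater_than is set and the error
+# is positive (log1 outperforms log2).  Worked example with threshold=0.08 on Acc@1:
+#   76.10 vs 76.05 -> error  0.05, within threshold, pass
+#   75.90 vs 76.05 -> error -0.15, outside threshold and negative, fail
+#   76.30 vs 76.05 -> error  0.25, outside threshold but positive, passes only
+#   when allow_greater_than=True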
+ + +from typing import List, Mapping, Union, Tuple +from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS + +LogLines = List[Mapping] +CompareResult = Tuple[bool, Union[List, Mapping]] + + +def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult: + if not isinstance(threshold, Mapping): + _thds = dict() + for key in value1.keys(): + _thds[key] = threshold + threshold = _thds + + result = dict() + satisfied = True + for key, _thd in threshold.items(): + v1, v2 = value1[key], value2[key] + origin_value_type = list + if not isinstance(v1, (tuple, list)): + origin_value_type = float + v1 = [v1] + v2 = [v2] + + real_errors = [] + for v1_i, v2_i in zip(v1, v2): + real_error = v1_i - v2_i + real_errors.append(real_error) + if satisfied and abs(real_error) > _thd: + if allow_greater_than and real_error > 0: + continue + satisfied = False + + if origin_value_type is float and len(real_errors) > 0: + real_errors = real_errors[0] + + result[key] = real_errors + + return satisfied, result + + +def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: + total_lines = len(log1[0]) + real_errors = [] + satisfied = True + for line_idx in range(total_lines): + _satisfied, _error = _compute_errors(log1[line_idx], log2[line_idx], threshold, allow_greater_than=allow_greater_than) + real_errors.append(_error) + if satisfied and not _satisfied: + satisfied = False + + return satisfied, real_errors + + +def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: + if len(log1) == 0 or len(log2) == 0: + return False, [] + return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than) + + +def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping], + patterns: List[str], + pattern_names: List[str] = None, + use_re: bool = False, + nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS, + start_line_pattern_flag: str = None, + end_line_pattern_flag: str = None, + only_last: bool=True, + split_pattern: Union[str, List] = None, + split_sep: List = None, + split_idx: List = None, + allow_greater_than: bool = False): + parser = LogParser( + patterns=patterns, pattern_names=pattern_names, + use_re=use_re, nearest_distance=nearest_distance, + start_line_pattern_flag=start_line_pattern_flag, + end_line_pattern_flag=end_line_pattern_flag, + split_pattern=split_pattern, + split_sep=split_sep, + split_idx=split_idx + ) + + log1 = parser.parse(log1) + log2 = parser.parse(log2) + + if only_last: + compare_result = compare_logs_by_last_result(log1, log2, threshold, allow_greater_than=allow_greater_than) + else: + compare_result = compare_logs(log1, log2, threshold, allow_greater_than=allow_greater_than) + + return compare_result[0], dict(log1=log1, log2=log2, errors=compare_result[-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..663b028a134bbeb8c659ab62c766b530c734c5d4 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/log_parser.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. 
+# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from typing import List, Optional, Union, Mapping +import re +import sys + + +DEFAULT_NEAREST_MATCH_CHARS = 10 + + +def read_file(file): + with open(file, 'r') as f: + return f.readlines() + +def read_pipe(): + result = [] + for line in sys.stdin: + result.append(line) + return result + +def postprocess_search_result(results: List[str]) -> List[float]: + if len(results) != 0: + results = list(map(float, results)) + return results + + +def extract_nearest_value_by_key_inline(content: str, key: str, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: + pattern = "%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance) + return extract_value_by_pattern_inline(content, pattern) + + +def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]: + results = re.findall(pattern, content) + return postprocess_search_result(results) + + +def extract_value(content: str, pattern: str, + inline=True, use_re=False, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: + if inline: + if use_re: + return extract_value_by_pattern_inline(content, pattern) + else: + return extract_nearest_value_by_key_inline(content, pattern, nearest_distance) + else: + raise NotImplementedError() + + +class LogParser: + + def __init__(self, + patterns: List[str]=None, + pattern_names: List[str]=None, + use_re: bool=False, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS, + start_line_pattern_flag: str=None, + end_line_pattern_flag: str=None, + split_pattern: Union[str, List]=None, + split_sep: List[str]=None, + split_idx: List[int]=None): + if patterns is None and split_sep is None: + raise ValueError("The one of argument `patterns` or `split_sep` must be given.") + + if pattern_names is not None: + if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names): + raise ValueError("The length of `pattern_names` argument not equal to `patterns`.") + if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names): + raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.") + + if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))): + raise ValueError("Invalid index to split text, got {}.".format(split_idx)) + + if split_sep is not None and split_pattern is None: + raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern)) + + self.patterns = patterns + self.use_re = use_re + self.nearest_distance = nearest_distance + self.start_line_pattern_flag = start_line_pattern_flag + self.end_line_pattern_flag = end_line_pattern_flag + + if not isinstance(split_sep, (tuple, list)) and split_sep is not None: + split_sep = [split_sep] + + if not isinstance(split_idx, (tuple, list)): + split_idx = [split_idx] + + self.split_sep = split_sep + self.split_idx = split_idx + + if pattern_names is None: + if patterns 
is None: + pattern_names = split_idx + else: + pattern_names = patterns + self.pattern_names = pattern_names + + if not isinstance(split_pattern, (tuple, list)) and split_sep is not None: + split_pattern = [split_pattern] * len(split_sep) + self.split_pattern = split_pattern + + self.start_record = start_line_pattern_flag is None + + def parse(self, path_or_logs: Union[str, List]) -> List[dict]: + """ + : return: [{matric_name: value}, ...] + """ + + + if path_or_logs: + path_or_logs = read_file(path_or_logs) + else: + path_or_logs = read_pipe() + + ret = [] + for line in path_or_logs: + result = self.parse_inline(line) + if len(result) == 0: + continue + ret.append(result) + return ret + + def parse_inline(self, line) -> dict: + if not self.can_record(line): + return {} + + if self.split_sep is None: + return self._parse_inline_by_match(line) + return self._parse_inline_by_split(line) + + def _parse_inline_by_match(self, line: str): + ret = {} + for name, pattern in zip(self.pattern_names, self.patterns): + result = extract_value( + line, pattern, inline=True, use_re=self.use_re, + nearest_distance=self.nearest_distance + ) + if len(result) == 0: + continue + ret[name] = result + return ret + + def _parse_inline_by_split(self, line: str, to_type=float): + ret = {} + for name, sep, idx, pattern in zip(self.pattern_names, + self.split_sep, + self.split_idx, + self.split_pattern): + if not self.can_matched(line, pattern): + continue + if '\t' in sep: + segs = line.strip().split(sep) + else: + segs = line.strip().replace('\t', ' ').split(sep) + segs = list(filter(lambda kv: kv.strip() not in ["", " ", None], segs)) + if len(segs) <= idx: + continue + ret[name] = to_type(segs[idx]) + return ret + + def can_record(self, line: str): + if self.start_line_pattern_flag is None: + self.start_record = True + elif not self.start_record: + self.start_record = self.can_matched(line, self.start_line_pattern_flag) + + if self.start_record: + if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag): + self.start_record = False + + return self.start_record + + def can_matched(self, content: str, pattern: str): + result = re.findall(pattern, content) + return len(result) != 0 + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py new file mode 100644 index 0000000000000000000000000000000000000000..8b03a70928ab4e016a0099daf0c29cb34c5b9d89 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/model_compare_config.py @@ -0,0 +1,306 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
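+
+# Declares which LogParser/LogComparator settings apply to each model.
+# ComparatorConfig.find_config() resolves a script name such as
+# "train_resnet50_dali_torch.sh" by reading the framework from its suffix,
+# stripping the "train_" prefix, and dropping trailing "_<part>" pieces until a
+# configured model name ("resnet50_dali") matches, falling back to the name of
+# the model's directory.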
+ + +import os.path as ospath + +from typing import NamedTuple, Union, List, Mapping + +from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS + + +class LogComparatorArgs(NamedTuple): + threshold: Union[float, Mapping] + patterns: List[str] = None + pattern_names: List[str] = None + use_re: bool = False + nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS + start_line_pattern_flag: str = None + end_line_pattern_flag: str = None + split_pattern: Union[str, List] = None + split_sep: List = None + split_idx: List = None + only_last: bool = True + allow_greater_than: bool = True + + def to_dict(self): + return self._asdict() + + +class ArgsModelsTuple(NamedTuple): + + args: LogComparatorArgs + models: List[str] + + +class BaseConfig: + + def __getitem__(self, item): + return self.__class__.__dict__[item] + + def __getattr__(self, item): + return self.__class__.__dict__[item] + + def __iter__(self): + for attr, value in self.__class__.__dict__.items(): + if isinstance(value, ArgsModelsTuple): + yield attr + + def iter_items(self): + for attr, value in self.__class__.__dict__.items(): + if isinstance(value, ArgsModelsTuple): + yield attr, value + + +class _TFComparatorConfig(BaseConfig): + + cnn_benchmarks = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="], + pattern_names=["Acc@1", "Acc@5"] + ), + models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"] + ) + + dist_cnn_becnmarks = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_sep=[' ', ' '], + split_idx=[9, 10], + split_pattern="[\s\S]*?images/sec:[\s\S]*?jitter", + pattern_names=['Acc@1', 'Acc@5'] + ), + models=[ + "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist" + ] + ) + + bert = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["eval_accuracy ="], + pattern_names=["Accuracy"] + ), + models=["bert"] + ) + + ssd = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["acc="], + pattern_names=["Acc@1"] + ), + models=["ssd"] + ) + + yolov3 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.8, + patterns=["mAP"] + ), + models=["yolov3"] + ) + + vnet = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["background_dice", "anterior_dice", "posterior_dice"] + ), + models=["vnet"] + ) + + +class _TorchComparatorConfig(BaseConfig): + classification = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=8.0, patterns=['Acc@1', 'Acc@5'], + start_line_pattern_flag="Start training", + ), + models=[ + 'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2', + 'vgg', 'resnet50_dali', 'resnext', 'densenet' + ] + ) + + detection = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.03, + patterns=[ + "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" + ], + pattern_names=["mAP"], + start_line_pattern_flag="IoU metric: bbox", + end_line_pattern_flag="IoU metric: segm" + ), + models=[ + 'maskrcnn', 'retinanet', 'ssd' + ] + ) + + bert_cola = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['mcc'] + ), + models=['bert_cola'] + ) + + bert_mrpc = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['acc'] + ), + models=['bert_mrpc'] + ) + + bert_pretrain_apex = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['eval_mlm_accaracy'] + ), + models=['bert_pretrain_apex'] + ) + + segmentation = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=8.0, + 
patterns=['mean IoU:'], + pattern_names=['mIoU'] + ), + models=[ + 'deeplabv3', 'fcn' + ] + ) + + t5 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=5.0, + split_pattern="eval_bleu[\s\S]*?=", + split_sep=["="], + split_idx=[1], + pattern_names=['EvalBleu'] + ), + models=['t5'] + ) + + yolov3 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["mAP"] + ), + models=['yolov3'] + ) + + yolov5 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=[ + "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" + ], + pattern_names=["mAP"], + ), + models=['yolov5'], + ) + + yolov5s_coco128 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['yolov5s_coco128'] + ) + + centernet_resnet18 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['centernet_resnet18'] + ) + + fcos_resnet50_fpn = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['fcos_resnet50_fpn'] + ) + + ocr_recognition = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.5, patterns=["0_word_acc"], + ), + models=[ + "sar", "satrn" + ] + ) + + + +class ComparatorConfig: + + _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig()) + + @classmethod + def get_frameworks(cls) -> List: + return list(cls._configs.keys()) + + @classmethod + def get(cls, tf_or_torch, name, default=None): + for model_kind, comb in cls._configs[tf_or_torch].iter_items(): + if name in comb.models: + return comb.args + if default is not None: + return default + raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch)) + + @classmethod + def find_config(cls, script_path: str) -> LogComparatorArgs: + tf_or_torch = script_path.split('.')[-2].split('_')[-1] + + # Find by the name of script + script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0] + if script_name.startswith('train_'): + script_name = script_name.replace("train_", "", 1) + while script_name not in [None, "", "/", "\\"]: + try: + config = cls.get(tf_or_torch, script_name) + return config + except: + pass + script_name = script_name.rsplit('_', maxsplit=1) + if len(script_name) <= 1: + break + script_name = script_name[0] + + # Find by the name of model's dir + model_dir_name = ospath.basename(ospath.dirname(script_path)) + try: + config = cls.get(tf_or_torch, model_dir_name) + return config + except: + raise RuntimeError("Not found for", script_path) + + +def get_compare_config_with_full_path(script_path: str, to_dict=True): + config = ComparatorConfig.find_config(script_path) + if to_dict: + return config.to_dict() + return config + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..91562d0db152a22bb3222284e09a1f5a33384209 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/base_cli.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from argparse import ArgumentParser +from abc import abstractmethod + + +class BaseCLI: + + def __init__(self, parser=None, *args, **kwargs): + if parser is None: + self.parser = ArgumentParser(description=self.description ,*args, **kwargs) + + def __call__(self): + self.run() + + @property + def description(self): + return None + + @abstractmethod + def command_name(self): + pass + + def predefine_args(self): + pass + + def parse_args(self, *args, **kwargs): + self.predefine_args() + return self.parser.parse_args(*args, **kwargs) + + @abstractmethod + def run(self): + pass + + + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py new file mode 100644 index 0000000000000000000000000000000000000000..91193331241c2f18ffee83478e02ca60ce20f854 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/get_env.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
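+
+# Environment helpers: get_envinfo() collects Python/CUDA/GPU/toolchain details
+# through torch, and get_gpu_type() reports "BI" for Iluvatar devices and "NV"
+# otherwise, with the DEBUG_GPU_TYPE environment variable as an override.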
+import os +from collections import defaultdict +import os.path as osp +import subprocess +import sys + + +def get_envinfo(): + import torch + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + if cuda_available: + from torch.utils.cpp_extension import CUDA_HOME + env_info['CUDA_HOME'] = CUDA_HOME + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + f'"{nvcc}" -V | tail -n1', shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, devids in devices.items(): + env_info['GPU ' + ','.join(devids)] = name + + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + + env_info['PyTorch'] = torch.__version__ + + return env_info + + +def get_gpu_type(): + import torch + if "DEBUG_GPU_TYPE" in os.environ: + return os.environ["DEBUG_GPU_TYPE"] + + if not torch.cuda.is_available(): + return "BI" + dev_name = torch.cuda.get_device_name(0) + if 'IX BI' in dev_name or getattr(torch, "corex", False): + _type = "BI" + else: + _type = "NV" + + return _type diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py new file mode 100644 index 0000000000000000000000000000000000000000..adcdefc52099557b1bd0b0111e891a70c16266c6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/iluvatar.py @@ -0,0 +1,32 @@ +import sys +import subprocess +from enum import Enum + +__all__ = ["get_iluvatar_card_type", "IluvatarGPU"] + +class IluvatarGPU(Enum): + UNKNOWN = -1 + MR50 = 0 + MR100 = 1 + BI150 = 2 + +card_ixsmi_names = { + "BI150": IluvatarGPU.BI150, + "BI-V150": IluvatarGPU.BI150, + "MR100": IluvatarGPU.MR100, + "MR-V100": IluvatarGPU.MR100, + "MR50": IluvatarGPU.MR50, + "MR-V50": IluvatarGPU.MR50, +} + +def get_iluvatar_card_type(): + command = 'ixsmi -L | grep "GPU \{1,\}0"' + result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if result.returncode == 0: + for key, value in card_ixsmi_names.items(): + if key in result.stdout: + return value + else: + return IluvatarGPU.UNKNOWN + else: + return IluvatarGPU.UNKNOWN diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py new file mode 100644 index 0000000000000000000000000000000000000000..13c1d6c7f1579bb38f66af77571ed1354d1a75a6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/infer_args.py @@ -0,0 +1,87 @@ +import os + +from typing import Union, List, Dict, Any, Mapping +from argparse import Namespace, ArgumentParser +import json + + +def _obj_to_dict(obj) -> Dict: + if isinstance(obj, Mapping): + return obj + + try: + from absl import flags + if isinstance(obj, flags.FlagValues): + return obj.flag_values_dict() 
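+    # absl is an optional dependency: if the import (or the FlagValues check) fails,
+    # fall through to the generic conversions below.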
+ except: + pass + if isinstance(obj, Namespace): + return obj.__dict__ + elif isinstance(obj, List): + new_obj = dict() + for _o in obj: + _o_dict = _obj_to_dict(_o) + new_obj.update(_o_dict) + return new_obj + elif not isinstance(obj, Dict): + if hasattr(obj, "__dict__"): + return obj.__dict__ + try: + typename = type(obj).__name__ + except: + typename = str(obj) + return {typename: str(obj)} + + +def json_dump_obj(o): + if hasattr(o, "__name__"): + return o.__name__ + return str(o) + + +def show_infer_arguments(args: Union[List, Dict, Any]): + """ print running arguments + Example 1: For ArgumentParser + >>> parser = ArgumentParser("Test") + >>> parser.add_argument("--arg0", type=str) + >>> args = parser.parse_args() + >>> show_infer_arguments(args) + + Example 2: For dict + >>> args = dict(arg=1) + >>> show_infer_arguments(args) + + Example 3: For custom object + >>> from collections import namedtuple + >>> ArgsType = namedtuple("ArgsType", ["arg"]) + >>> args = ArgsType(arg=123) + >>> show_infer_arguments(args) + + Example 4: For absl + >>> from absl import flags + >>> flags.DEFINE_string("arg", "123", "test") + >>> show_infer_arguments(flags.FLAGS) + + Example 5: For multi args + >>> args1 = dict(a=1) + >>> args2 = dict(b=2) + >>> show_infer_arguments([args1, args2]) + + """ + if not "SHOW_RUNNING_ARGS" in os.environ: + return + + if os.environ["SHOW_RUNNING_ARGS"].lower() in ["0", "f", "false"]: + return + + if "LOCAL_RANK" in os.environ: + if os.environ["LOCAL_RANK"] != "0": + return + args = _obj_to_dict(args) + args = json.dumps(args, default=json_dump_obj) + print("[RunningArguments]", args) + + +if __name__ == '__main__': + os.environ["SHOW_RUNNING_ARGS"] = "1" + show_infer_arguments([dict(a=1), dict(b=1), object()]) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f8cfacfbc041d0242503ef1a6cd898ecfa5ccf2d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/misc.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
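+# Small shared helpers: absolute-path resolution, rank-aware printing for
+# distributed runs (RANK/LOCAL_RANK), and creation of a subprocess environment
+# flagged with USE_DLTEST=1.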
+import copy +import os + + +def get_full_path(fname): + pwd = os.getcwd() + if fname.startswith('/'): + return fname + return os.path.join(pwd, fname) + + +def is_main_proc(rank): + return str(rank) in ["None", "-1", "0"] + + +def main_proc_print(*args, **kwargs): + if "RANK" in os.environ: + if is_main_proc(os.environ["RANK"]): + print(*args, **kwargs) + return + + if "LOCAL_RANK" in os.environ: + if is_main_proc(os.environ["LOCAL_RANK"]): + print(*args, **kwargs) + return + + print(*args, **kwargs) + + +def create_subproc_env(): + env = copy.copy(os.environ) + env["USE_DLTEST"] = "1" + return env \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py new file mode 100644 index 0000000000000000000000000000000000000000..e23230de1831fc100efbb81bec4597efe7a2f712 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/real_tempfile.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import os +import os.path as ospath +from pathlib import Path +import tempfile + + +class TemporaryFile: + + def __init__(self, with_open=False, mode='r'): + self.name = None + self.with_open = with_open + self.mode = mode + + self.file = None + + def create(self): + self.name = tempfile.mktemp() + file_path = Path(self.name) + file_path.touch() + + def delete(self): + if self.name is not None and ospath.exists(self.name): + os.unlink(self.name) + + def read(self): + self._check_file_status() + return self.file.read() + + def readlines(self): + self._check_file_status() + return self.file.readlines() + + def _check_file_status(self): + if self.file is None: + raise RuntimeError("File is closed, please reopen it.") + + def __enter__(self): + self.create() + if self.with_open: + self.file = open(self.name, mode=self.mode) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.with_open: + self.file.close() + self.delete() + + + + + + + + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..135faa89e670a64675cde81312a437ac8f2b5a1d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/dltest/utils/subprocess_tools.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. 
+# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import subprocess +from typing import Callable, Union, List + +from dltest.utils.real_tempfile import TemporaryFile +from dltest.utils import misc + + +def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs): + if shell is None: + shell = True + + if shell and not isinstance(command, str): + command = " ".join(command) + + stream = subprocess.Popen( + command, shell=shell, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + *args, **kwargs + ) + outputs = [] + while 1: + exit_code = stream.poll() + if exit_code is None: + if stream.stdout.readable(): + outputs.append(stream.stdout.readline().decode("utf8").rstrip()) + if callback is not None: + callback(outputs[-1:]) + print(outputs[-1]) + else: + if stream.stdout.readable(): + lines = stream.stdout.readlines() + lines = [line.decode("utf8".rstrip()) for line in lines] + outputs.extend(lines) + if callback is not None: + callback(outputs[-1:]) + print('\n'.join(lines)) + break + + return outputs + + +def get_output_with_tempfile(command, *args, **kwargs): + if not isinstance(command, (list, tuple)): + command = [command] + stdout = None + with TemporaryFile(with_open=True) as file: + command.extend(['|', 'tee', file.name]) + command = " ".join(command) + + res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, *args, **kwargs) + output = file.readlines() + + return output + +def execute_shell(command, *args, **kwargs): + if "env" not in kwargs: + kwargs["env"] = misc.create_subproc_env() + + if not isinstance(command, (list, tuple)): + command = [command] + + command = " ".join(command) + res = subprocess.run(command, + shell=True, *args, **kwargs) + return res + +def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs): + if "env" not in kwargs: + kwargs["env"] = misc.create_subproc_env() + + if capture_output_method == "tempfile": + return get_output_with_tempfile(command, *args, **kwargs) + return get_output_with_pipe(command, *args, **kwargs) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..52d5db6f62e74906ac2ea0a55f7e82b57cfe02e5 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/dltest/setup.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from setuptools import setup, find_packages +from dltest.cli.entry_points import make_execute_path + +setup( + name="dltest", + version="0.1", + description='Iluvatar Corex AI Toolbox', + packages=find_packages(exclude=('examples')), + include_package_data=True, + zip_safe=False, + entry_points = { + 'console_scripts': make_execute_path(), + }, + install_requires=[ + 'psutil' + ] +) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..7c40a978ea07b4a2bd107cd0cbba1c63ecea7256 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_bert.py @@ -0,0 +1,582 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Optional + +import onnx +from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper +from passes.fuse_series_bias_add import FusionSerialBiasAdd +from passes.fusion_albert_attention import FusionAlbertAttention +from passes.fusion_attention import AttentionMask, FusionAttention +from passes.fusion_biasgelu import FusionBiasGelu +from passes.fusion_customfc import ( + FusionCustomFC, + FusionCustomFCActivation, + FusionCustomFCGPT2, +) +from passes.fusion_disentangled_attention import FusionDisentangledAttention +from passes.fusion_embedlayer import FusionEmbedLayerNormalization +from passes.fusion_fastgelu import FusionFastGelu +from passes.fusion_format_roformer import ( + FusionFormatInvalidMask, + FusionRemoveUselessElementwise, +) +from passes.fusion_gelu import FusionGelu +from passes.fusion_gelu_approximation import FusionGeluApproximation +from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast +from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF +from passes.fusion_options import FusionOptions +from passes.fusion_qordered_attention import FusionQOrderedAttention +from passes.fusion_qordered_gelu import FusionQOrderedGelu +from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization +from passes.fusion_qordered_matmul import FusionQOrderedMatMul +from passes.fusion_reshape import FusionReshape +from passes.fusion_shape import FusionShape +from passes.fusion_skiplayernorm import ( + FusionBiasSkipLayerNormalization, + FusionSkipLayerNormalization, +) +from passes.fusion_swinl_attention import FusionSwinLAttention +from passes.fusion_utils import FusionUtils +from passes.fusion_videobert_attention import FusionVideoBertAttention +from passes.fusion_vit_attention import FusionVITAttention +from passes.fusion_xsoftmax import FusionXSoftmax +from passes.onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class BertOptimizationOptions(FusionOptions): + """This class is 
deprecated""" + + def __init__(self, model_type): + logger.warning( + f"BertOptimizationOptions is depreciated. Please use FusionOptions instead." + ) + super().__init__(model_type) + + +class BertOnnxModel(OnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize BERT ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). + """ + assert (num_heads == 0 and hidden_size == 0) or ( + num_heads > 0 and hidden_size % num_heads == 0 + ) + + super().__init__(model) + self.num_heads = num_heads + self.hidden_size = hidden_size + + self.attention_mask = AttentionMask(self) + self.attention_fusion = FusionAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.qordered_attention_fusion = FusionQOrderedAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.utils = FusionUtils(self) + + def fuse_attention(self): + self.attention_fusion.apply() + FusionAlbertAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ).apply() + FusionVideoBertAttention(self).apply() + FusionVITAttention(self).apply() + FusionSwinLAttention(self).apply() + FusionGptAttentionNoPast(self).apply() + # Only relevant in models with Q-DQ nodes + self.qordered_attention_fusion.apply() + + def fuse_format_roformer(self): + FusionRemoveUselessElementwise(self).apply() + fusion = FusionFormatInvalidMask(self) + fusion.apply() + + def fuse_custom_fc(self): + fusion = FusionCustomFC(self) + fusion.apply() + + def fuse_custom_fc_activation(self): + fusion = FusionCustomFCActivation(self) + fusion.apply() + + def fuse_custom_fc_gpt2_classify(self): + fusion = FusionCustomFCGPT2(self) + fusion.apply() + + def fuse_swinT_serial_bias_add(self): + fusion = FusionSerialBiasAdd(self) + fusion.apply() + + def fuse_gelu(self): + fusion = FusionGelu(self) + fusion.apply() + fusion = FusionFastGelu(self) + fusion.apply() + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedGelu(self) + fusion.apply() + + def fuse_bias_gelu(self, is_fastgelu): + fusion = FusionBiasGelu(self, is_fastgelu) + fusion.apply() + + def fuse_custom_xsoftmax(self): + fusion = FusionXSoftmax(self) + fusion.apply() + + def fuse_disentangled_attention(self): + fusion = FusionDisentangledAttention(self) + fusion.apply() + + def gelu_approximation(self): + fusion = FusionGeluApproximation(self) + fusion.apply() + + def fuse_add_bias_skip_layer_norm(self): + fusion = FusionBiasSkipLayerNormalization(self) + fusion.apply() + + def fuse_reshape(self): + fusion = FusionReshape(self) + fusion.apply() + + def fuse_shape(self): + fusion = FusionShape(self) + fusion.apply() + + def fuse_embed_layer(self): + fusion = FusionEmbedLayerNormalization(self) + fusion.apply() + + def fuse_layer_norm(self): + fusion = FusionLayerNormalization(self, self.hidden_size) + fusion.apply() + + fusion = FusionLayerNormalizationTF(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedLayerNormalization(self) + fusion.apply() + + def fuse_skip_layer_norm(self): + fusion = FusionSkipLayerNormalization(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + def fuse_qordered_mamtul(self): + fusion = FusionQOrderedMatMul(self) + fusion.apply() + + def get_graph_inputs_from_node_type( + self, op_type: 
str, input_indices: List[int], casted: bool + ): + """ + Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). + Returns a list of the graph input names based on the filter whether it is casted or not. + """ + graph_inputs = [] + + output_name_to_node = self.output_name_to_node() + nodes = self.get_nodes_by_op_type(op_type) + for node in nodes: + bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] + for bert_input in bert_inputs: + if self.find_graph_input(bert_input): + if not casted: + graph_inputs.append(bert_input) + elif bert_input in output_name_to_node: + parent = output_name_to_node[bert_input] + if ( + parent.op_type == "Cast" + and self.find_graph_input(parent.input[0]) is not None + ): + if casted: + graph_inputs.append(parent.input[0]) + return graph_inputs + + def get_graph_inputs_from_fused_nodes(self, casted: bool): + inputs = self.get_graph_inputs_from_node_type( + "EmbedLayerNormalization", [0, 1, 7], casted + ) + inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) + return inputs + + def change_graph_input_type( + self, + graph: GraphProto, + graph_input: ValueInfoProto, + new_type: int = TensorProto.INT32, + ): + """Change graph input type, and add Cast node if needed. + + Args: + graph (GraphProto): graph + graph_input (TensorProto): input of the graph + new_type (int, optional): new data type. Defaults to TensorProto.INT32. + + Returns: + NodeProto: a new Cast node that added. None if Cast node is not added. + List[NodeProto]: Cast nodes that have been removed. + """ + assert isinstance(graph, GraphProto) + assert isinstance(graph_input, ValueInfoProto) + assert self.find_graph_input(graph_input.name) + + if graph_input.type.tensor_type.elem_type == int(new_type): + return None, [] + + new_cast_node = None + nodes_to_remove = [] + + input_name_to_nodes = self.input_name_to_nodes() + if graph_input.name in input_name_to_nodes: + nodes = input_name_to_nodes[graph_input.name] + + # For children that is not Cast node, insert a Cast node to convert int32 to original data type. + nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] + if nodes_not_cast: + node_name = self.create_node_name("Cast") + output_name = node_name + "_" + graph_input.name + new_value_info = graph.value_info.add() + new_value_info.CopyFrom(graph_input) + new_value_info.name = output_name + new_cast_node = helper.make_node( + "Cast", + [graph_input.name], + [output_name], + to=int(graph_input.type.tensor_type.elem_type), + name=node_name, + ) + graph.node.extend([new_cast_node]) + + for node in nodes_not_cast: + OnnxModel.replace_node_input(node, graph_input.name, output_name) + + # For children that is Cast node, no need to insert Cast. + # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
+ nodes_cast = [node for node in nodes if node.op_type == "Cast"] + for node in nodes_cast: + if OnnxModel.get_node_attribute(node, "to") == int(new_type): + self.replace_input_of_all_nodes(node.output[0], graph_input.name) + if not self.find_graph_output(node.output[0]): + nodes_to_remove.append(node) + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + + graph_input.type.tensor_type.elem_type = int(new_type) + return new_cast_node, nodes_to_remove + + def change_graph_inputs_to_int32(self): + """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" + graph = self.graph() + add_cast_count = 0 + remove_cast_count = 0 + for graph_input in graph.input: + new_node, removed_nodes = self.change_graph_input_type( + graph, graph_input, TensorProto.INT32 + ) + if new_node: + add_cast_count += 1 + remove_cast_count += len(removed_nodes) + logger.info( + f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." + ) + + def use_dynamic_axes( + self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" + ): + """ + Update input and output shape to use dynamic axes. + """ + bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( + casted=True + ) + self.get_graph_inputs_from_fused_nodes(casted=False) + + dynamic_batch_inputs = {} + for input in self.model.graph.input: + if input.name in bert_graph_inputs: + dim_proto = input.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + if dynamic_seq_len is not None: + dim_proto = input.type.tensor_type.shape.dim[1] + dim_proto.dim_param = dynamic_seq_len + + for output in self.model.graph.output: + dim_proto = output.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + + def preprocess(self): + self.adjust_reshape_and_expand() + return + + def adjust_reshape_and_expand(self): + nodes_to_remove = [] + for node in self.nodes(): + if node.op_type == "Reshape": + # Clean up unneccessary reshape nodes. + # Find reshape nodes with no actually data in "shape" attribute and remove. + reshape_shape = self.get_constant_value(node.input[1]) + if reshape_shape is not None and reshape_shape.size == 0: + nodes_to_remove.extend([node]) + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + continue + + # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by + # changing current reshape's input to output of slice. 
+ reshape_path = self.match_parent_path( + node, + ["Expand", "Expand", "Reshape", "Slice"], + [0, 0, 0, 0], + self.output_name_to_node(), + ) + if reshape_path is not None: + expand_node = reshape_path[-3] + expand_shape_value = self.get_constant_value(expand_node.input[1]) + + reshape_before_expand = reshape_path[-2] + shape_value = self.get_constant_value( + reshape_before_expand.input[1] + ) + + slice_node = reshape_path[-1] + if ( + expand_shape_value is not None + and shape_value is not None + and len(expand_shape_value) == 2 + and len(shape_value) == 1 + and expand_shape_value[1] == shape_value[0] + ): + node.input[0] = slice_node.output[0] + + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") + + def clean_graph(self): + output_name_to_node = self.output_name_to_node() + nodes_to_remove = [] + for node in self.nodes(): + # Before: + # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ + # | | + # | v + # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # After: + # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) + op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} + if node.op_type in op_input_id: + i = op_input_id[node.op_type] + parent_nodes = self.match_parent_path( + node, + [ + "Cast", + "ConstantOfShape", + "Concat", + "Unsqueeze", + "Gather", + "Shape", + ], + [i, 0, 0, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + ( + cast, + constantOfShape, + concat, + unsqueeze, + gather, + shape, + ) = parent_nodes + if shape.input[0] == self.graph().input[0].name: + constantOfShape.input[0] = shape.output[0] + output_name_to_node = self.output_name_to_node() + + if node.op_type == "Attention": + # Before: + # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention + # After: + # remove this path, and remove the optional mask_index input of Attention node. + parent_nodes = self.match_parent_path( + node, + ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], + [3, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + if parent_nodes[-1].input[0] == self.graph().input[0].name: + attention_node = helper.make_node( + "Attention", + inputs=node.input[0 : len(node.input) - 1], + outputs=node.output, + name=node.name + "_remove_mask", + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [helper.make_attribute("num_heads", self.num_heads)] + ) + self.add_node( + attention_node, self.get_graph_by_node(attention_node).name + ) + nodes_to_remove.append(node) + self.remove_nodes(nodes_to_remove) + + def postprocess(self): + self.clean_graph() + self.prune_graph() + + def optimize( + self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False + ): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + self.utils.remove_identity_nodes() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
+ self.utils.remove_useless_cast_nodes() + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + + self.preprocess() + + self.fuse_reshape() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + if options.enable_swint_opt: + self.fuse_custom_fc() + self.fuse_swinT_serial_bias_add() + + if options.enable_format_roformer: + self.fuse_format_roformer() + + if options.enable_gpt2_classify or options.enable_vit: + self.fuse_custom_fc_gpt2_classify() + + if options.enable_vit: + self.fuse_custom_fc() + + if (options is None) or options.enable_attention: + if options is not None: + self.attention_mask.set_mask_format(options.attention_mask_format) + self.fuse_attention() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + self.fuse_custom_fc() + + self.fuse_custom_xsoftmax() + + self.fuse_disentangled_attention() + + # Perform the MatMul fusion after the Attention fusion as we do not + # want to fuse the MatMuls inside the Attention subgraphs + if (options is None) or options.enable_qordered_matmul: + self.fuse_qordered_mamtul() + + self.fuse_shape() + + if (options is None) or options.enable_embed_layer_norm: + self.fuse_embed_layer() + + # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. + self.utils.remove_useless_reshape_nodes() + + self.postprocess() + + # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization + if (options is None) or options.enable_bias_gelu: + # Fuse Gelu and Add Bias before it. + self.fuse_bias_gelu(is_fastgelu=True) + self.fuse_bias_gelu(is_fastgelu=False) + + if (options is None) or options.enable_bias_skip_layer_norm: + # Fuse SkipLayerNormalization and Add Bias before it. + self.fuse_add_bias_skip_layer_norm() + + if options is not None and options.enable_gelu_approximation: + self.gelu_approximation() + + self.fuse_custom_fc_activation() + + self.remove_unused_constant() + + # Use symbolic batch dimension in input and output. + if add_dynamic_axes: + self.use_dynamic_axes() + + logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "EmbedLayerNormalization", + "Attention", + "QOrderedAttention", + "Gelu", + "QOrderedGelu", + "FastGelu", + "BiasGelu", + "LayerNormalization", + "QOrderedLayerNormalization", + "SkipLayerNormalization", + "QOrderedMatMul", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + logger.info(f"Optimized operators:{op_count}") + return op_count + + def is_fully_optimized(self): + """ + Returns True when the model is fully optimized. 
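+        Here "fully optimized" means embedding, attention and GELU fusions were all
+        applied, with one GELU per attention block and at least two (skip) layer
+        normalizations per attention block.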
+ """ + op_count = self.get_fused_operator_statistics() + embed = op_count["EmbedLayerNormalization"] + attention = op_count["Attention"] + op_count["QOrderedAttention"] + gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] + layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] + is_perfect = ( + (embed > 0) + and (attention > 0) + and (attention == gelu) + and (layer_norm >= 2 * attention) + ) + + if layer_norm == 0: + logger.debug("Layer Normalization not fused") + + if gelu == 0: + logger.debug("Gelu/FastGelu not fused") + + if embed == 0: + logger.debug("Embed Layer not fused") + + if attention == 0: + logger.warning("Attention not fused") + + return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py new file mode 100644 index 0000000000000000000000000000000000000000..a250a9ea05c5d7b625523e62b976f94fa7ab6cff --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_conformer.py @@ -0,0 +1,576 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Optional + +import onnx +from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper +from passes.fuse_series_bias_add import FusionSerialBiasAdd +from passes.fusion_albert_attention import FusionAlbertAttention +from passes.fusion_attention import AttentionMask, FusionAttention +from passes.fusion_biasgelu import FusionBiasGelu +from passes.fusion_conformer_attention import FusionConformerAttention +from passes.fusion_conformer_xsoftmax import FusionConformerXSoftmax +from passes.fusion_customfc import ( + FusionConformerCustomFCActivation, + FusionCustomFC, + FusionCustomFCGPT2, +) +from passes.fusion_disentangled_attention import FusionDisentangledAttention +from passes.fusion_embedlayer import FusionEmbedLayerNormalization +from passes.fusion_fastgelu import FusionFastGelu +from passes.fusion_format_roformer import ( + FusionFormatInvalidMask, + FusionRemoveUselessElementwise, +) +from passes.fusion_gelu import FusionGelu +from passes.fusion_gelu_approximation import FusionGeluApproximation +from passes.fusion_gpt_attention_no_past import FusionGptAttentionNoPast +from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF +from passes.fusion_options import FusionOptions +from passes.fusion_qordered_attention import FusionQOrderedAttention +from passes.fusion_qordered_gelu import FusionQOrderedGelu +from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization +from passes.fusion_qordered_matmul import FusionQOrderedMatMul +from passes.fusion_reshape import FusionReshape +from passes.fusion_shape import FusionShape +from passes.fusion_skiplayernorm import ( + FusionBiasSkipLayerNormalization, + FusionSkipLayerNormalization, +) +from passes.fusion_splitQKV import FusionSplitQKV +from passes.fusion_swinl_attention import FusionSwinLAttention +from passes.fusion_utils import FusionUtils +from passes.fusion_vit_attention import FusionVITAttention +from passes.onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class 
ConformerOptimizationOptions(FusionOptions): + """This class is deprecated""" + + def __init__(self, model_type): + logger.warning( + f"BertOptimizationOptions is depreciated. Please use FusionOptions instead." + ) + super().__init__(model_type) + + +class conformerOnnxModel(OnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize BERT ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). + """ + assert (num_heads == 0 and hidden_size == 0) or ( + num_heads > 0 and hidden_size % num_heads == 0 + ) + + super().__init__(model) + self.num_heads = num_heads + self.hidden_size = hidden_size + + self.attention_mask = AttentionMask(self) + self.attention_fusion = FusionAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.qordered_attention_fusion = FusionQOrderedAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.utils = FusionUtils(self) + + def fuse_attention(self): + FusionConformerAttention(self, self.hidden_size, self.num_heads).apply() + # Only relevant in models with Q-DQ nodes + self.qordered_attention_fusion.apply() + + def fuse_format_roformer(self): + FusionRemoveUselessElementwise(self).apply() + fusion = FusionFormatInvalidMask(self) + fusion.apply() + + def fuse_custom_fc(self): + fusion = FusionCustomFC(self) + fusion.apply() + + def fuse_custom_fc_conformer_activation(self): + fusion = FusionConformerCustomFCActivation(self) + fusion.apply() + + def fuse_custom_fc_gpt2_classify(self): + fusion = FusionCustomFCGPT2(self) + fusion.apply() + + def fuse_swinT_serial_bias_add(self): + fusion = FusionSerialBiasAdd(self) + fusion.apply() + + def fuse_gelu(self): + fusion = FusionGelu(self) + fusion.apply() + fusion = FusionFastGelu(self) + fusion.apply() + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedGelu(self) + fusion.apply() + + def fuse_bias_gelu(self, is_fastgelu): + fusion = FusionBiasGelu(self, is_fastgelu) + fusion.apply() + + def fuse_custom_xsoftmax(self): + fusion = FusionConformerXSoftmax(self) + fusion.apply() + + def fuse_disentangled_attention(self): + fusion = FusionDisentangledAttention(self) + fusion.apply() + + def gelu_approximation(self): + fusion = FusionGeluApproximation(self) + fusion.apply() + + def fuse_add_bias_skip_layer_norm(self): + fusion = FusionBiasSkipLayerNormalization(self) + fusion.apply() + + def fuse_reshape(self): + fusion = FusionReshape(self) + fusion.apply() + + def fuse_shape(self): + fusion = FusionShape(self) + fusion.apply() + + def fuse_embed_layer(self): + fusion = FusionEmbedLayerNormalization(self) + fusion.apply() + + def fuse_layer_norm(self): + fusion = FusionLayerNormalization(self, self.hidden_size) + fusion.apply() + + fusion = FusionLayerNormalizationTF(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedLayerNormalization(self) + fusion.apply() + + def fuse_skip_layer_norm(self): + fusion = FusionSkipLayerNormalization(self) + fusion.apply() + + def fuse_split_qkv(self): + fusion = FusionSplitQKV(self, self.hidden_size, self.num_heads) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + def fuse_qordered_mamtul(self): + fusion = FusionQOrderedMatMul(self) + fusion.apply() + + def get_graph_inputs_from_node_type( + self, op_type: 
str, input_indices: List[int], casted: bool + ): + """ + Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). + Returns a list of the graph input names based on the filter whether it is casted or not. + """ + graph_inputs = [] + + output_name_to_node = self.output_name_to_node() + nodes = self.get_nodes_by_op_type(op_type) + for node in nodes: + bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] + for bert_input in bert_inputs: + if self.find_graph_input(bert_input): + if not casted: + graph_inputs.append(bert_input) + elif bert_input in output_name_to_node: + parent = output_name_to_node[bert_input] + if ( + parent.op_type == "Cast" + and self.find_graph_input(parent.input[0]) is not None + ): + if casted: + graph_inputs.append(parent.input[0]) + return graph_inputs + + def get_graph_inputs_from_fused_nodes(self, casted: bool): + inputs = self.get_graph_inputs_from_node_type( + "EmbedLayerNormalization", [0, 1, 7], casted + ) + inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) + return inputs + + def change_graph_input_type( + self, + graph: GraphProto, + graph_input: ValueInfoProto, + new_type: int = TensorProto.INT32, + ): + """Change graph input type, and add Cast node if needed. + + Args: + graph (GraphProto): graph + graph_input (TensorProto): input of the graph + new_type (int, optional): new data type. Defaults to TensorProto.INT32. + + Returns: + NodeProto: a new Cast node that added. None if Cast node is not added. + List[NodeProto]: Cast nodes that have been removed. + """ + assert isinstance(graph, GraphProto) + assert isinstance(graph_input, ValueInfoProto) + assert self.find_graph_input(graph_input.name) + + if graph_input.type.tensor_type.elem_type == int(new_type): + return None, [] + + new_cast_node = None + nodes_to_remove = [] + + input_name_to_nodes = self.input_name_to_nodes() + if graph_input.name in input_name_to_nodes: + nodes = input_name_to_nodes[graph_input.name] + + # For children that is not Cast node, insert a Cast node to convert int32 to original data type. + nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] + if nodes_not_cast: + node_name = self.create_node_name("Cast") + output_name = node_name + "_" + graph_input.name + new_value_info = graph.value_info.add() + new_value_info.CopyFrom(graph_input) + new_value_info.name = output_name + new_cast_node = helper.make_node( + "Cast", + [graph_input.name], + [output_name], + to=int(graph_input.type.tensor_type.elem_type), + name=node_name, + ) + graph.node.extend([new_cast_node]) + + for node in nodes_not_cast: + OnnxModel.replace_node_input(node, graph_input.name, output_name) + + # For children that is Cast node, no need to insert Cast. + # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
+ nodes_cast = [node for node in nodes if node.op_type == "Cast"] + for node in nodes_cast: + if OnnxModel.get_node_attribute(node, "to") == int(new_type): + self.replace_input_of_all_nodes(node.output[0], graph_input.name) + if not self.find_graph_output(node.output[0]): + nodes_to_remove.append(node) + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + + graph_input.type.tensor_type.elem_type = int(new_type) + return new_cast_node, nodes_to_remove + + def change_graph_inputs_to_int32(self): + """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" + graph = self.graph() + add_cast_count = 0 + remove_cast_count = 0 + for graph_input in graph.input: + new_node, removed_nodes = self.change_graph_input_type( + graph, graph_input, TensorProto.INT32 + ) + if new_node: + add_cast_count += 1 + remove_cast_count += len(removed_nodes) + logger.info( + f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." + ) + + def use_dynamic_axes( + self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" + ): + """ + Update input and output shape to use dynamic axes. + """ + bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( + casted=True + ) + self.get_graph_inputs_from_fused_nodes(casted=False) + + dynamic_batch_inputs = {} + for input in self.model.graph.input: + if input.name in bert_graph_inputs: + dim_proto = input.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + if dynamic_seq_len is not None: + dim_proto = input.type.tensor_type.shape.dim[1] + dim_proto.dim_param = dynamic_seq_len + + for output in self.model.graph.output: + dim_proto = output.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + + def preprocess(self): + self.adjust_reshape_and_expand() + return + + def adjust_reshape_and_expand(self): + nodes_to_remove = [] + for node in self.nodes(): + if node.op_type == "Reshape": + # Clean up unneccessary reshape nodes. + # Find reshape nodes with no actually data in "shape" attribute and remove. + reshape_shape = self.get_constant_value(node.input[1]) + if reshape_shape is not None and reshape_shape.size == 0: + nodes_to_remove.extend([node]) + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + continue + + # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by + # changing current reshape's input to output of slice. 
+ reshape_path = self.match_parent_path( + node, + ["Expand", "Expand", "Reshape", "Slice"], + [0, 0, 0, 0], + self.output_name_to_node(), + ) + if reshape_path is not None: + expand_node = reshape_path[-3] + expand_shape_value = self.get_constant_value(expand_node.input[1]) + + reshape_before_expand = reshape_path[-2] + shape_value = self.get_constant_value( + reshape_before_expand.input[1] + ) + + slice_node = reshape_path[-1] + if ( + expand_shape_value is not None + and shape_value is not None + and len(expand_shape_value) == 2 + and len(shape_value) == 1 + and expand_shape_value[1] == shape_value[0] + ): + node.input[0] = slice_node.output[0] + + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") + + def clean_graph(self): + output_name_to_node = self.output_name_to_node() + nodes_to_remove = [] + for node in self.nodes(): + # Before: + # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ + # | | + # | v + # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # After: + # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) + op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} + if node.op_type in op_input_id: + i = op_input_id[node.op_type] + parent_nodes = self.match_parent_path( + node, + [ + "Cast", + "ConstantOfShape", + "Concat", + "Unsqueeze", + "Gather", + "Shape", + ], + [i, 0, 0, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + ( + cast, + constantOfShape, + concat, + unsqueeze, + gather, + shape, + ) = parent_nodes + if shape.input[0] == self.graph().input[0].name: + constantOfShape.input[0] = shape.output[0] + output_name_to_node = self.output_name_to_node() + + if node.op_type == "Attention": + # Before: + # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention + # After: + # remove this path, and remove the optional mask_index input of Attention node. + parent_nodes = self.match_parent_path( + node, + ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], + [3, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + if parent_nodes[-1].input[0] == self.graph().input[0].name: + attention_node = helper.make_node( + "Attention", + inputs=node.input[0 : len(node.input) - 1], + outputs=node.output, + name=node.name + "_remove_mask", + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [helper.make_attribute("num_heads", self.num_heads)] + ) + self.add_node( + attention_node, self.get_graph_by_node(attention_node).name + ) + nodes_to_remove.append(node) + self.remove_nodes(nodes_to_remove) + + def postprocess(self): + self.clean_graph() + self.prune_graph() + + def optimize( + self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False + ): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + self.utils.remove_identity_nodes() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
+ self.utils.remove_useless_cast_nodes() + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + + self.preprocess() + + self.fuse_reshape() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + if options.enable_swint_opt: + self.fuse_custom_fc() + self.fuse_swinT_serial_bias_add() + + if options.enable_format_roformer: + self.fuse_format_roformer() + + if options.enable_gpt2_classify or options.enable_vit: + self.fuse_custom_fc_gpt2_classify() + + if options.enable_vit: + self.fuse_custom_fc() + + self.fuse_custom_fc() + self.fuse_custom_xsoftmax() + + self.fuse_attention() + + self.fuse_split_qkv() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + # Perform the MatMul fusion after the Attention fusion as we do not + # want to fuse the MatMuls inside the Attention subgraphs + if (options is None) or options.enable_qordered_matmul: + self.fuse_qordered_mamtul() + + self.fuse_shape() + + if (options is None) or options.enable_embed_layer_norm: + self.fuse_embed_layer() + + # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. + self.utils.remove_useless_reshape_nodes() + + self.postprocess() + + # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization + if (options is None) or options.enable_bias_gelu: + # Fuse Gelu and Add Bias before it. + self.fuse_bias_gelu(is_fastgelu=True) + self.fuse_bias_gelu(is_fastgelu=False) + + if (options is None) or options.enable_bias_skip_layer_norm: + # Fuse SkipLayerNormalization and Add Bias before it. + self.fuse_add_bias_skip_layer_norm() + + if options is not None and options.enable_gelu_approximation: + self.gelu_approximation() + + self.remove_unused_constant() + self.fuse_custom_fc_conformer_activation() + + # Use symbolic batch dimension in input and output. + if add_dynamic_axes: + self.use_dynamic_axes() + + logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "EmbedLayerNormalization", + "Attention", + "QOrderedAttention", + "Gelu", + "QOrderedGelu", + "FastGelu", + "BiasGelu", + "LayerNormalization", + "QOrderedLayerNormalization", + "SkipLayerNormalization", + "QOrderedMatMul", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + logger.info(f"Optimized operators:{op_count}") + return op_count + + def is_fully_optimized(self): + """ + Returns True when the model is fully optimized. 
+ """ + op_count = self.get_fused_operator_statistics() + embed = op_count["EmbedLayerNormalization"] + attention = op_count["Attention"] + op_count["QOrderedAttention"] + gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] + layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] + is_perfect = ( + (embed > 0) + and (attention > 0) + and (attention == gelu) + and (layer_norm >= 2 * attention) + ) + + if layer_norm == 0: + logger.debug("Layer Normalization not fused") + + if gelu == 0: + logger.debug("Gelu/FastGelu not fused") + + if embed == 0: + logger.debug("Embed Layer not fused") + + if attention == 0: + logger.warning("Attention not fused") + + return is_perfect + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..85889319916199298dad2e9d2b47cde052c7c746 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_roformer.py @@ -0,0 +1,540 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Optional + +import onnx +from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper +from passes.fuse_series_bias_add import FusionSerialBiasAdd +from passes.fusion_albert_attention import FusionAlbertAttention +from passes.fusion_attention import AttentionMask, FusionAttention +from passes.fusion_biasgelu import FusionBiasGelu +from passes.fusion_customfc import ( + FusionCustomFC, + FusionCustomFCActivation, + FusionCustomFcRoformer, +) +from passes.fusion_disentangled_attention import FusionDisentangledAttention +from passes.fusion_embedlayer import FusionEmbedLayerNormalization +from passes.fusion_fastgelu import FusionFastGelu +from passes.fusion_format_roformer import ( + FusionFormatInvalidMask, + FusionRemoveUselessElementwise, +) +from passes.fusion_gelu import FusionGelu +from passes.fusion_gelu_approximation import FusionGeluApproximation +from passes.fusion_layernorm import ( + FusionLayerNormalization, + FusionLayerNormalizationKeras, + FusionLayerNormalizationTF, +) +from passes.fusion_options import FusionOptions +from passes.fusion_qordered_attention import FusionQOrderedAttention +from passes.fusion_qordered_gelu import FusionQOrderedGelu +from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization +from passes.fusion_qordered_matmul import FusionQOrderedMatMul +from passes.fusion_reshape import FusionReshape +from passes.fusion_roformer_attention import FusionRoformerCrossAttention +from passes.fusion_rope import FusionRoPE +from passes.fusion_shape import FusionShape +from passes.fusion_skiplayernorm import ( + FusionBiasSkipLayerNormalization, + FusionSkipLayerNormalization, +) +from passes.fusion_swinl_attention import FusionSwinLAttention +from passes.fusion_utils import FusionUtils +from passes.fusion_videobert_attention import FusionVideoBertAttention +from passes.fusion_vit_attention import FusionVITAttention +from passes.fusion_xsoftmax import FusionXSoftmax +from passes.onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class 
RoformerOnnxModel(OnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize BERT ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). + """ + assert (num_heads == 0 and hidden_size == 0) or ( + num_heads > 0 and hidden_size % num_heads == 0 + ) + + super().__init__(model) + self.num_heads = num_heads + self.hidden_size = hidden_size + + self.attention_mask = AttentionMask(self) + self.attention_fusion = FusionAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.qordered_attention_fusion = FusionQOrderedAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.utils = FusionUtils(self) + + def fuse_attention(self): + FusionRoformerCrossAttention(self).apply() + + def fuse_format_roformer(self): + # FusionRemoveUselessElementwise(self).apply() + fusion = FusionFormatInvalidMask(self) + fusion.apply() + + def fuse_custom_fc(self): + fusion = FusionCustomFC(self) + fusion.apply() + + def fuse_custom_fc_activation(self): + fusion = FusionCustomFCActivation(self) + fusion.apply() + + def fuse_custom_fc_roformer(self): + fusion = FusionCustomFcRoformer(self) + fusion.apply() + + def fuse_rope(self): + fusion = FusionRoPE(self) + fusion.apply() + + def fuse_swinT_serial_bias_add(self): + fusion = FusionSerialBiasAdd(self) + fusion.apply() + + def fuse_gelu(self): + fusion = FusionGelu(self) + fusion.apply() + fusion = FusionFastGelu(self) + fusion.apply() + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedGelu(self) + fusion.apply() + + def fuse_bias_gelu(self, is_fastgelu): + fusion = FusionBiasGelu(self, is_fastgelu) + fusion.apply() + + def gelu_approximation(self): + fusion = FusionGeluApproximation(self) + fusion.apply() + + def fuse_add_bias_skip_layer_norm(self): + fusion = FusionBiasSkipLayerNormalization(self) + fusion.apply() + + def fuse_reshape(self): + fusion = FusionReshape(self) + fusion.apply() + + def fuse_shape(self): + fusion = FusionShape(self) + fusion.apply() + + def fuse_embed_layer(self): + fusion = FusionEmbedLayerNormalization(self) + fusion.apply() + + def fuse_layer_norm(self): + fusion = FusionLayerNormalizationKeras(self) + fusion.apply() + + def fuse_skip_layer_norm(self): + fusion = FusionSkipLayerNormalization(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + def fuse_qordered_mamtul(self): + fusion = FusionQOrderedMatMul(self) + fusion.apply() + + def get_graph_inputs_from_node_type( + self, op_type: str, input_indices: List[int], casted: bool + ): + """ + Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). + Returns a list of the graph input names based on the filter whether it is casted or not. 
+ """ + graph_inputs = [] + + output_name_to_node = self.output_name_to_node() + nodes = self.get_nodes_by_op_type(op_type) + for node in nodes: + bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] + for bert_input in bert_inputs: + if self.find_graph_input(bert_input): + if not casted: + graph_inputs.append(bert_input) + elif bert_input in output_name_to_node: + parent = output_name_to_node[bert_input] + if ( + parent.op_type == "Cast" + and self.find_graph_input(parent.input[0]) is not None + ): + if casted: + graph_inputs.append(parent.input[0]) + return graph_inputs + + def get_graph_inputs_from_fused_nodes(self, casted: bool): + inputs = self.get_graph_inputs_from_node_type( + "EmbedLayerNormalization", [0, 1, 7], casted + ) + inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) + return inputs + + def change_graph_input_type( + self, + graph: GraphProto, + graph_input: ValueInfoProto, + new_type: int = TensorProto.INT32, + ): + """Change graph input type, and add Cast node if needed. + + Args: + graph (GraphProto): graph + graph_input (TensorProto): input of the graph + new_type (int, optional): new data type. Defaults to TensorProto.INT32. + + Returns: + NodeProto: a new Cast node that added. None if Cast node is not added. + List[NodeProto]: Cast nodes that have been removed. + """ + assert isinstance(graph, GraphProto) + assert isinstance(graph_input, ValueInfoProto) + assert self.find_graph_input(graph_input.name) + + if graph_input.type.tensor_type.elem_type == int(new_type): + return None, [] + + new_cast_node = None + nodes_to_remove = [] + + input_name_to_nodes = self.input_name_to_nodes() + if graph_input.name in input_name_to_nodes: + nodes = input_name_to_nodes[graph_input.name] + + # For children that is not Cast node, insert a Cast node to convert int32 to original data type. + nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] + if nodes_not_cast: + node_name = self.create_node_name("Cast") + output_name = node_name + "_" + graph_input.name + new_value_info = graph.value_info.add() + new_value_info.CopyFrom(graph_input) + new_value_info.name = output_name + new_cast_node = helper.make_node( + "Cast", + [graph_input.name], + [output_name], + to=int(graph_input.type.tensor_type.elem_type), + name=node_name, + ) + graph.node.extend([new_cast_node]) + + for node in nodes_not_cast: + OnnxModel.replace_node_input(node, graph_input.name, output_name) + + # For children that is Cast node, no need to insert Cast. + # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. 
+ nodes_cast = [node for node in nodes if node.op_type == "Cast"] + for node in nodes_cast: + if OnnxModel.get_node_attribute(node, "to") == int(new_type): + self.replace_input_of_all_nodes(node.output[0], graph_input.name) + if not self.find_graph_output(node.output[0]): + nodes_to_remove.append(node) + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + + graph_input.type.tensor_type.elem_type = int(new_type) + return new_cast_node, nodes_to_remove + + def change_graph_inputs_to_int32(self): + """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" + graph = self.graph() + add_cast_count = 0 + remove_cast_count = 0 + for graph_input in graph.input: + new_node, removed_nodes = self.change_graph_input_type( + graph, graph_input, TensorProto.INT32 + ) + if new_node: + add_cast_count += 1 + remove_cast_count += len(removed_nodes) + logger.info( + f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." + ) + + def use_dynamic_axes( + self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" + ): + """ + Update input and output shape to use dynamic axes. + """ + bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( + casted=True + ) + self.get_graph_inputs_from_fused_nodes(casted=False) + + dynamic_batch_inputs = {} + for input in self.model.graph.input: + if input.name in bert_graph_inputs: + dim_proto = input.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + if dynamic_seq_len is not None: + dim_proto = input.type.tensor_type.shape.dim[1] + dim_proto.dim_param = dynamic_seq_len + + for output in self.model.graph.output: + dim_proto = output.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + + def preprocess(self): + self.adjust_reshape_and_expand() + return + + def adjust_reshape_and_expand(self): + nodes_to_remove = [] + for node in self.nodes(): + if node.op_type == "Reshape": + # Clean up unneccessary reshape nodes. + # Find reshape nodes with no actually data in "shape" attribute and remove. + reshape_shape = self.get_constant_value(node.input[1]) + if reshape_shape is not None and reshape_shape.size == 0: + nodes_to_remove.extend([node]) + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + continue + + # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by + # changing current reshape's input to output of slice. 
+ reshape_path = self.match_parent_path( + node, + ["Expand", "Expand", "Reshape", "Slice"], + [0, 0, 0, 0], + self.output_name_to_node(), + ) + if reshape_path is not None: + expand_node = reshape_path[-3] + expand_shape_value = self.get_constant_value(expand_node.input[1]) + + reshape_before_expand = reshape_path[-2] + shape_value = self.get_constant_value( + reshape_before_expand.input[1] + ) + + slice_node = reshape_path[-1] + if ( + expand_shape_value is not None + and shape_value is not None + and len(expand_shape_value) == 2 + and len(shape_value) == 1 + and expand_shape_value[1] == shape_value[0] + ): + node.input[0] = slice_node.output[0] + + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") + + def clean_graph(self): + output_name_to_node = self.output_name_to_node() + nodes_to_remove = [] + for node in self.nodes(): + # Before: + # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ + # | | + # | v + # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # After: + # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) + op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} + if node.op_type in op_input_id: + i = op_input_id[node.op_type] + parent_nodes = self.match_parent_path( + node, + [ + "Cast", + "ConstantOfShape", + "Concat", + "Unsqueeze", + "Gather", + "Shape", + ], + [i, 0, 0, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + ( + cast, + constantOfShape, + concat, + unsqueeze, + gather, + shape, + ) = parent_nodes + if shape.input[0] == self.graph().input[0].name: + constantOfShape.input[0] = shape.output[0] + output_name_to_node = self.output_name_to_node() + + if node.op_type == "Attention": + # Before: + # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention + # After: + # remove this path, and remove the optional mask_index input of Attention node. + parent_nodes = self.match_parent_path( + node, + ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], + [3, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + if parent_nodes[-1].input[0] == self.graph().input[0].name: + attention_node = helper.make_node( + "Attention", + inputs=node.input[0 : len(node.input) - 1], + outputs=node.output, + name=node.name + "_remove_mask", + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [helper.make_attribute("num_heads", self.num_heads)] + ) + self.add_node( + attention_node, self.get_graph_by_node(attention_node).name + ) + nodes_to_remove.append(node) + self.remove_nodes(nodes_to_remove) + + def postprocess(self): + self.clean_graph() + self.prune_graph() + + def optimize( + self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False + ): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + self.utils.remove_identity_nodes() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. 
+ self.utils.remove_useless_cast_nodes() + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + + self.preprocess() + + self.fuse_reshape() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + if options.enable_format_roformer: + self.fuse_format_roformer() + + self.fuse_custom_fc_roformer() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + self.fuse_custom_fc() + + if (options is None) or options.enable_attention: + if options is not None: + self.attention_mask.set_mask_format(options.attention_mask_format) + self.fuse_attention() + + self.fuse_rope() + + self.fuse_shape() + + # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. + self.utils.remove_useless_reshape_nodes() + + self.postprocess() + + # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization + if (options is None) or options.enable_bias_gelu: + # Fuse Gelu and Add Bias before it. + self.fuse_bias_gelu(is_fastgelu=True) + self.fuse_bias_gelu(is_fastgelu=False) + + if (options is None) or options.enable_bias_skip_layer_norm: + # Fuse SkipLayerNormalization and Add Bias before it. + self.fuse_add_bias_skip_layer_norm() + + if options is not None and options.enable_gelu_approximation: + self.gelu_approximation() + + self.fuse_custom_fc_activation() + + self.remove_unused_constant() + + # Use symbolic batch dimension in input and output. + if add_dynamic_axes: + self.use_dynamic_axes() + + logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "EmbedLayerNormalization", + "Attention", + "QOrderedAttention", + "Gelu", + "QOrderedGelu", + "FastGelu", + "BiasGelu", + "LayerNormalization", + "QOrderedLayerNormalization", + "SkipLayerNormalization", + "QOrderedMatMul", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + logger.info(f"Optimized operators:{op_count}") + return op_count + + def is_fully_optimized(self): + """ + Returns True when the model is fully optimized. 
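+
+        Concretely, the heuristic below requires at least one EmbedLayerNormalization, at least one
+        (QOrdered)Attention, an equal number of Gelu/FastGelu/BiasGelu and attention nodes, and at
+        least two (Skip)LayerNormalization nodes per attention.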
+ """ + op_count = self.get_fused_operator_statistics() + embed = op_count["EmbedLayerNormalization"] + attention = op_count["Attention"] + op_count["QOrderedAttention"] + gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] + layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] + is_perfect = ( + (embed > 0) + and (attention > 0) + and (attention == gelu) + and (layer_norm >= 2 * attention) + ) + + if layer_norm == 0: + logger.debug("Layer Normalization not fused") + + if gelu == 0: + logger.debug("Gelu/FastGelu not fused") + + if embed == 0: + logger.debug("Embed Layer not fused") + + if attention == 0: + logger.warning("Attention not fused") + + return is_perfect + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py new file mode 100644 index 0000000000000000000000000000000000000000..4b1d6b5fec3bfa10533527a72e475cca1bc63b86 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_t5.py @@ -0,0 +1,519 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Optional + +import onnx +from onnx import GraphProto, ModelProto, TensorProto, ValueInfoProto, helper +from passes.fusion_attention import AttentionMask, FusionAttention +from passes.fusion_biasgelu import FusionBiasGelu +from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation +from passes.fusion_embedlayer import FusionEmbedLayerNormalization +from passes.fusion_fastgelu import FusionFastGelu +from passes.fusion_gelu import FusionGelu +from passes.fusion_gelu_approximation import FusionGeluApproximation +from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF +from passes.fusion_options import FusionOptions +from passes.fusion_qordered_attention import FusionQOrderedAttention +from passes.fusion_qordered_gelu import FusionQOrderedGelu +from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization +from passes.fusion_qordered_matmul import FusionQOrderedMatMul +from passes.fusion_reshape import FusionReshape +from passes.fusion_rms_norm import FusionRMSNorm +from passes.fusion_shape import FusionShape +from passes.fusion_skiplayernorm import ( + FusionBiasSkipLayerNormalization, + FusionSkipLayerNormalization, +) +from passes.fusion_t5_attention import FusionT5Attention +from passes.fusion_utils import FusionUtils +from passes.onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class BertOptimizationOptions(FusionOptions): + """This class is deprecated""" + + def __init__(self, model_type): + logger.warning( + f"BertOptimizationOptions is depreciated. Please use FusionOptions instead." + ) + super().__init__(model_type) + + +class T5OnnxModel(OnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize T5 ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
+ """ + assert (num_heads == 0 and hidden_size == 0) or ( + num_heads > 0 and hidden_size % num_heads == 0 + ) + + super().__init__(model) + self.num_heads = num_heads + self.hidden_size = hidden_size + + self.attention_mask = AttentionMask(self) + self.attention_fusion = FusionAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.qordered_attention_fusion = FusionQOrderedAttention( + self, self.hidden_size, self.num_heads, self.attention_mask + ) + self.utils = FusionUtils(self) + + def fuse_custom_fc(self): + fusion = FusionCustomFC(self) + fusion.apply() + + def fuse_custom_fc_activation(self): + fusion = FusionCustomFCActivation(self) + fusion.apply() + + def fuse_gelu(self): + fusion = FusionGelu(self) + fusion.apply() + fusion = FusionFastGelu(self) + fusion.apply() + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedGelu(self) + fusion.apply() + + def fuse_bias_gelu(self, is_fastgelu): + fusion = FusionBiasGelu(self, is_fastgelu) + fusion.apply() + + def gelu_approximation(self): + fusion = FusionGeluApproximation(self) + fusion.apply() + + def fuse_add_bias_skip_layer_norm(self): + fusion = FusionBiasSkipLayerNormalization(self) + fusion.apply() + + def fuse_reshape(self): + fusion = FusionReshape(self) + fusion.apply() + + def fuse_shape(self): + fusion = FusionShape(self) + fusion.apply() + + def fuse_embed_layer(self): + fusion = FusionEmbedLayerNormalization(self) + fusion.apply() + + def fuse_rms_norm(self): + fusion = FusionRMSNorm(self) + fusion.apply() + + def fuse_t5_attention(self): + fusion = FusionT5Attention(self) + fusion.apply() + # pass + + def fuse_layer_norm(self): + fusion = FusionLayerNormalization(self) + fusion.apply() + + fusion = FusionLayerNormalizationTF(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedLayerNormalization(self) + fusion.apply() + + def fuse_skip_layer_norm(self): + fusion = FusionSkipLayerNormalization(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + def fuse_qordered_mamtul(self): + fusion = FusionQOrderedMatMul(self) + fusion.apply() + + def get_graph_inputs_from_node_type( + self, op_type: str, input_indices: List[int], casted: bool + ): + """ + Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention). + Returns a list of the graph input names based on the filter whether it is casted or not. + """ + graph_inputs = [] + + output_name_to_node = self.output_name_to_node() + nodes = self.get_nodes_by_op_type(op_type) + for node in nodes: + bert_inputs = [node.input[i] for i in input_indices if i < len(node.input)] + for bert_input in bert_inputs: + if self.find_graph_input(bert_input): + if not casted: + graph_inputs.append(bert_input) + elif bert_input in output_name_to_node: + parent = output_name_to_node[bert_input] + if ( + parent.op_type == "Cast" + and self.find_graph_input(parent.input[0]) is not None + ): + if casted: + graph_inputs.append(parent.input[0]) + return graph_inputs + + def get_graph_inputs_from_fused_nodes(self, casted: bool): + inputs = self.get_graph_inputs_from_node_type( + "EmbedLayerNormalization", [0, 1, 7], casted + ) + inputs += self.get_graph_inputs_from_node_type("Attention", [3], casted) + return inputs + + def change_graph_input_type( + self, + graph: GraphProto, + graph_input: ValueInfoProto, + new_type: int = TensorProto.INT32, + ): + """Change graph input type, and add Cast node if needed. 
+ + Args: + graph (GraphProto): graph + graph_input (TensorProto): input of the graph + new_type (int, optional): new data type. Defaults to TensorProto.INT32. + + Returns: + NodeProto: a new Cast node that added. None if Cast node is not added. + List[NodeProto]: Cast nodes that have been removed. + """ + assert isinstance(graph, GraphProto) + assert isinstance(graph_input, ValueInfoProto) + assert self.find_graph_input(graph_input.name) + + if graph_input.type.tensor_type.elem_type == int(new_type): + return None, [] + + new_cast_node = None + nodes_to_remove = [] + + input_name_to_nodes = self.input_name_to_nodes() + if graph_input.name in input_name_to_nodes: + nodes = input_name_to_nodes[graph_input.name] + + # For children that is not Cast node, insert a Cast node to convert int32 to original data type. + nodes_not_cast = [node for node in nodes if node.op_type != "Cast"] + if nodes_not_cast: + node_name = self.create_node_name("Cast") + output_name = node_name + "_" + graph_input.name + new_value_info = graph.value_info.add() + new_value_info.CopyFrom(graph_input) + new_value_info.name = output_name + new_cast_node = helper.make_node( + "Cast", + [graph_input.name], + [output_name], + to=int(graph_input.type.tensor_type.elem_type), + name=node_name, + ) + graph.node.extend([new_cast_node]) + + for node in nodes_not_cast: + OnnxModel.replace_node_input(node, graph_input.name, output_name) + + # For children that is Cast node, no need to insert Cast. + # When the children is Cast to int32, we can remove that Cast node since input type is int32 now. + nodes_cast = [node for node in nodes if node.op_type == "Cast"] + for node in nodes_cast: + if OnnxModel.get_node_attribute(node, "to") == int(new_type): + self.replace_input_of_all_nodes(node.output[0], graph_input.name) + if not self.find_graph_output(node.output[0]): + nodes_to_remove.append(node) + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + + graph_input.type.tensor_type.elem_type = int(new_type) + return new_cast_node, nodes_to_remove + + def change_graph_inputs_to_int32(self): + """Change data type of all graph inputs to int32 type, and add Cast node if needed.""" + graph = self.graph() + add_cast_count = 0 + remove_cast_count = 0 + for graph_input in graph.input: + new_node, removed_nodes = self.change_graph_input_type( + graph, graph_input, TensorProto.INT32 + ) + if new_node: + add_cast_count += 1 + remove_cast_count += len(removed_nodes) + logger.info( + f"Graph inputs are changed to int32. Added {add_cast_count} Cast nodes, and removed {remove_cast_count} Cast nodes." + ) + + def use_dynamic_axes( + self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_seq_len" + ): + """ + Update input and output shape to use dynamic axes. 
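+
+        Only inputs that feed fused EmbedLayerNormalization/Attention nodes are touched: their first
+        dimension is renamed to dynamic_batch_dim and, unless dynamic_seq_len is None, their second
+        dimension is renamed to dynamic_seq_len. For every graph output only the first dimension is renamed.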
+ """ + bert_graph_inputs = self.get_graph_inputs_from_fused_nodes( + casted=True + ) + self.get_graph_inputs_from_fused_nodes(casted=False) + + dynamic_batch_inputs = {} + for input in self.model.graph.input: + if input.name in bert_graph_inputs: + dim_proto = input.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + if dynamic_seq_len is not None: + dim_proto = input.type.tensor_type.shape.dim[1] + dim_proto.dim_param = dynamic_seq_len + + for output in self.model.graph.output: + dim_proto = output.type.tensor_type.shape.dim[0] + dim_proto.dim_param = dynamic_batch_dim + + def preprocess(self): + self.adjust_reshape_and_expand() + return + + def adjust_reshape_and_expand(self): + nodes_to_remove = [] + for node in self.nodes(): + if node.op_type == "Reshape": + # Clean up unneccessary reshape nodes. + # Find reshape nodes with no actually data in "shape" attribute and remove. + reshape_shape = self.get_constant_value(node.input[1]) + if reshape_shape is not None and reshape_shape.size == 0: + nodes_to_remove.extend([node]) + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + continue + + # Find path "Slice" -> "Reshape" -> "Expand" -> "Expand" -> current "Reshape", simplify the graph by + # changing current reshape's input to output of slice. + reshape_path = self.match_parent_path( + node, + ["Expand", "Expand", "Reshape", "Slice"], + [0, 0, 0, 0], + self.output_name_to_node(), + ) + if reshape_path is not None: + expand_node = reshape_path[-3] + expand_shape_value = self.get_constant_value(expand_node.input[1]) + + reshape_before_expand = reshape_path[-2] + shape_value = self.get_constant_value( + reshape_before_expand.input[1] + ) + + slice_node = reshape_path[-1] + if ( + expand_shape_value is not None + and shape_value is not None + and len(expand_shape_value) == 2 + and len(shape_value) == 1 + and expand_shape_value[1] == shape_value[0] + ): + node.input[0] = slice_node.output[0] + + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + logger.info(f"Removed Reshape and Expand count: {len(nodes_to_remove)}") + + def clean_graph(self): + output_name_to_node = self.output_name_to_node() + nodes_to_remove = [] + for node in self.nodes(): + # Before: + # input_ids --> Shape --> Gather(indices=0) --> Unsqueeze ------+ + # | | + # | v + # +----> Shape --> Gather(indices=1) --> Unsqueeze---> Concat --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # After: + # input_ids --> Shape --> ConstantOfShape -->Cast --> EmbedLayerNormaliation/ReduceSum + # TODO: merge ConstantOfShape -->Cast to ConstantOfShape (need update the data type of value) + op_input_id = {"EmbedLayerNormalization": 1, "ReduceSum": 0, "Attention": 3} + if node.op_type in op_input_id: + i = op_input_id[node.op_type] + parent_nodes = self.match_parent_path( + node, + [ + "Cast", + "ConstantOfShape", + "Concat", + "Unsqueeze", + "Gather", + "Shape", + ], + [i, 0, 0, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + ( + cast, + constantOfShape, + concat, + unsqueeze, + gather, + shape, + ) = parent_nodes + if shape.input[0] == self.graph().input[0].name: + constantOfShape.input[0] = shape.output[0] + output_name_to_node = self.output_name_to_node() + + if node.op_type == "Attention": + # Before: + # input_ids --> Shape -->ConstantOfShape -->Cast --> ReduceSum --> Attention + # After: + # remove this path, and remove the optional mask_index input of Attention node. 
+ parent_nodes = self.match_parent_path( + node, + ["ReduceSum", "Cast", "ConstantOfShape", "Shape"], + [3, 0, 0, 0], + output_name_to_node, + ) + if parent_nodes is not None: + if parent_nodes[-1].input[0] == self.graph().input[0].name: + attention_node = helper.make_node( + "Attention", + inputs=node.input[0 : len(node.input) - 1], + outputs=node.output, + name=node.name + "_remove_mask", + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [helper.make_attribute("num_heads", self.num_heads)] + ) + self.add_node( + attention_node, self.get_graph_by_node(attention_node).name + ) + nodes_to_remove.append(node) + self.remove_nodes(nodes_to_remove) + + def postprocess(self): + self.clean_graph() + self.prune_graph() + + def optimize( + self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False + ): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + self.utils.remove_identity_nodes() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. + self.utils.remove_useless_cast_nodes() + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + + self.preprocess() + + self.fuse_reshape() + + if (options is None) or options.enable_skip_layer_norm: + self.fuse_skip_layer_norm() + + # Perform the MatMul fusion after the Attention fusion as we do not + # want to fuse the MatMuls inside the Attention subgraphs + if (options is None) or options.enable_qordered_matmul: + self.fuse_qordered_mamtul() + + self.fuse_shape() + + self.fuse_rms_norm() + + self.fuse_t5_attention() + + if (options is None) or options.enable_embed_layer_norm: + self.fuse_embed_layer() + + # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. + self.utils.remove_useless_reshape_nodes() + + self.postprocess() + + # Bias fusion is done after postprocess to avoid extra Reshape between bias and Gelu/FastGelu/SkipLayerNormalization + if (options is None) or options.enable_bias_gelu: + # Fuse Gelu and Add Bias before it. + self.fuse_bias_gelu(is_fastgelu=True) + self.fuse_bias_gelu(is_fastgelu=False) + + if (options is None) or options.enable_bias_skip_layer_norm: + # Fuse SkipLayerNormalization and Add Bias before it. + self.fuse_add_bias_skip_layer_norm() + + if options is not None and options.enable_gelu_approximation: + self.gelu_approximation() + + self.remove_unused_constant() + + # Use symbolic batch dimension in input and output. + if add_dynamic_axes: + self.use_dynamic_axes() + + logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "EmbedLayerNormalization", + "Attention", + "QOrderedAttention", + "Gelu", + "QOrderedGelu", + "FastGelu", + "BiasGelu", + "LayerNormalization", + "QOrderedLayerNormalization", + "SkipLayerNormalization", + "QOrderedMatMul", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + logger.info(f"Optimized operators:{op_count}") + return op_count + + def is_fully_optimized(self): + """ + Returns True when the model is fully optimized. 
+ """ + op_count = self.get_fused_operator_statistics() + embed = op_count["EmbedLayerNormalization"] + attention = op_count["Attention"] + op_count["QOrderedAttention"] + gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"] + layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"] + is_perfect = ( + (embed > 0) + and (attention > 0) + and (attention == gelu) + and (layer_norm >= 2 * attention) + ) + + if layer_norm == 0: + logger.debug("Layer Normalization not fused") + + if gelu == 0: + logger.debug("Gelu/FastGelu not fused") + + if embed == 0: + logger.debug("Embed Layer not fused") + + if attention == 0: + logger.warning("Attention not fused") + + return is_perfect diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..57982d0cc739fd766b5cc87a51479c62dabb22be --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/onnx_model_yolo.py @@ -0,0 +1,114 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Optional + +from onnx import ModelProto +from passes.fuse_series_bias_add import FusionSerialBiasAdd +from passes.fusion_customfc import FusionCustomFC, FusionCustomFCActivation +from passes.fusion_fastgelu import FusionFastGelu +from passes.fusion_format_roformer import ( + FusionFormatInvalidMask, + FusionRemoveUselessElementwise, +) +from passes.fusion_gelu import FusionGelu +from passes.fusion_gelu_approximation import FusionGeluApproximation +from passes.fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF +from passes.fusion_options import FusionOptions +from passes.fusion_qordered_attention import FusionQOrderedAttention +from passes.fusion_qordered_gelu import FusionQOrderedGelu +from passes.fusion_qordered_layernorm import FusionQOrderedLayerNormalization +from passes.fusion_reshape import FusionReshape +from passes.fusion_shape import FusionShape +from passes.fusion_utils import FusionUtils +from passes.fusion_yolov5_decoder import FusionYoloV5Decoder +from passes.onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class YoloOnnxModel(OnnxModel): + def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): + """Initialize BERT ONNX Model. + + Args: + model (ModelProto): the ONNX model + num_heads (int, optional): number of attention heads. Defaults to 0 (detect the parameter automatically). + hidden_size (int, optional): hidden dimension. Defaults to 0 (detect the parameter automatically). 
+ """ + assert (num_heads == 0 and hidden_size == 0) or ( + num_heads > 0 and hidden_size % num_heads == 0 + ) + super().__init__(model) + self.utils = FusionUtils(self) + + def fuse_format_roformer(self): + FusionRemoveUselessElementwise(self).apply() + fusion = FusionFormatInvalidMask(self) + fusion.apply() + + def fuse_custom_fc(self): + fusion = FusionCustomFC(self) + fusion.apply() + + def fuse_custom_fc_activation(self): + fusion = FusionCustomFCActivation(self) + fusion.apply() + + def fuse_swinT_serial_bias_add(self): + fusion = FusionSerialBiasAdd(self) + fusion.apply() + + def fuse_gelu(self): + fusion = FusionGelu(self) + fusion.apply() + fusion = FusionFastGelu(self) + fusion.apply() + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedGelu(self) + fusion.apply() + + def fuse_reshape(self): + fusion = FusionReshape(self) + fusion.apply() + + def fuse_shape(self): + fusion = FusionShape(self) + fusion.apply() + + def fuse_layer_norm(self): + fusion = FusionLayerNormalization(self, 0) + fusion.apply() + + fusion = FusionLayerNormalizationTF(self) + fusion.apply() + + # Only relevant in models with Q-DQ nodes + fusion = FusionQOrderedLayerNormalization(self) + fusion.apply() + + def optimize( + self, options: Optional[FusionOptions] = None, add_dynamic_axes: bool = False + ): + if (options is not None) and not options.enable_shape_inference: + self.disable_shape_inference() + + self.utils.remove_identity_nodes() + + # Remove cast nodes that having same data type of input and output based on symbolic shape inference. + self.utils.remove_useless_cast_nodes() + + if (options is None) or options.enable_layer_norm: + self.fuse_layer_norm() + + if (options is None) or options.enable_gelu: + self.fuse_gelu() + + self.fuse_reshape() + + FusionYoloV5Decoder(self).apply() + self.remove_unused_constant() + logger.info(f"opset version: {self.get_opset_version()}") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md new file mode 100644 index 0000000000000000000000000000000000000000..dc823d366b327141bd5646e7d3aef153349cea8e --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.md @@ -0,0 +1,51 @@ +# IxRT optimizer + +## 1. optimizer 简介 +`optimizer` 是一个 ixrt 中集成的图融合工具,用于将onnx图中的op融合成对应的ixrt plugin; + +## 2. optimizer 功能说明 +| 功能 | 说明 | +| -------------- | ---- | +| 多 batchsize 支持 | 支持设置不同 batchsize 进行推理测试 | +| 动态图支持 | 支持融合动态图和静态图 | +| 模型支持 | 目前测试通过videobert, roberta, deberta, swinL, roformer, albert等模型 | + +## 3. optimizer 运行参数 +| 参数 | 说明 | +| -------------- | ---- | +| `--onnx` | 必选 ,指定要运行的 onnx 模型路径 | +| `--num_heads` | 可选 ,指定模型对应Attention模块注意力头的个数 | +|`--hidden_size` | 可选, 模型模型隐藏层的大小| +|`--input_shapes` | 可选 ,指定模型输入数据类型,示例 --input_shapes "input_name1:3x224x224, input_name2:3x224x224"类型 | +| `--dump_onnx` | 可选 ,用于图融合过程中dump出中间的onnx图 | +|`--model_type` | 可选 ,可以指定要融合的模型类型,默认是"bert", 可选["bert", "swint", "roformer"]| +|`--log_level` |可选 ,指定ixrt运行时显示日志的等级, 可指定为debug、info、error,默认为 info| + + +## 4. 
运行示例
+
+### 4.1 示例1:融合albert|videobert|roberta|deberta
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH}
+```
+
+### 4.2 示例2:融合swinL
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH} --input_shapes pixel_values.1:${BS}x3x384x384 --model_type swint
+```
+
+### 4.3 示例3:融合roformer
+```bash
+cd oss/tools/optimizer
+python3 optimizer.py --onnx ${MODEL_PATH} --model_type roformer
+```
+
+### 4.4 精度验证
+
+请参考[高级话题](5_advanced_topics.md)中的精度对比工具一节,了解详细使用方法和原理。
+
+也可以使用[C++ API 使用简介](3_cpp_api.md)或 [Python API 使用简介](4_python_api.md)中介绍的接口自行构建推理来验证精度。
+
+具体使用方法可以参考 oss/samples。
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..701bd7a41f9a7b87249b1af5e6e8aaac6db4d53d
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/optimizer.py
@@ -0,0 +1,195 @@
+import argparse
+import logging
+import time
+from typing import Dict, Optional
+
+import onnx
+from onnx import ModelProto, helper, load_model
+from onnx_model_bert import BertOnnxModel
+from onnx_model_roformer import RoformerOnnxModel
+from onnx_model_conformer import conformerOnnxModel
+from onnx_model_t5 import T5OnnxModel
+from onnx_model_yolo import YoloOnnxModel
+from onnxsim import simplify
+from passes.fusion_options import FusionOptions
+from passes.symbolic_shape_infer import SymbolicShapeInference
+
+logger = logging.getLogger(__name__)
+MODEL_TYPES = {
+    "bert": (BertOnnxModel, None, "pytorch", 1),
+    "swint": (BertOnnxModel, None, "pytorch", 1),
+    "roformer": (RoformerOnnxModel, None, "tf2onnx", 1),
+    "gpt2": (BertOnnxModel, None, "pytorch", 1),
+    "t5": (T5OnnxModel, None, "tf2onnx", 1),
+    "yolo": (YoloOnnxModel, None, "pytorch", 1),
+    "vit": (BertOnnxModel, None, "pytorch", 1),
+    "conformer": (conformerOnnxModel, None, "pytorch", 1),
+}
+
+
+def optimize_by_fusion(
+    model: ModelProto,
+    model_type: str = "bert",
+    num_heads: int = 0,
+    hidden_size: int = 0,
+    optimization_options: Optional[FusionOptions] = None,
+):
+    """Optimize Model by graph fusion logic.
+
+    Note that ONNXRuntime graph optimizations (like constant folding) will not be applied, so it is better to enable
+    constant folding when exporting the ONNX model, or to run optimize_by_onnxruntime on the model first, as optimize_model does.
+
+    For BERT models, num_heads and hidden_size are optional. For other model types, you need to specify these parameters.
+
+    Args:
+        model (ModelProto): model object
+        model_type (str, optional): model type - one of bert, swint, roformer, gpt2, t5, yolo, vit or conformer. Defaults to 'bert'.
+        num_heads (int, optional): number of attention heads. Defaults to 0.
+                                   0 means the parameter is detected from the graph automatically (for model_type "bert" only).
+        hidden_size (int, optional): hidden size. Defaults to 0.
+                                     0 means the parameter is detected from the graph automatically (for model_type "bert" only).
+        optimization_options (FusionOptions, optional): optimization options that turn on/off some fusions. Defaults to None.
+
+    Returns:
+        object of an optimizer class.
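+
+    Example (illustrative sketch; the file names are placeholders):
+
+        >>> import onnx
+        >>> model = onnx.load("bert_base_static.onnx")
+        >>> optimizer, _ = optimize_by_fusion(model, model_type="bert", num_heads=12, hidden_size=768)
+        >>> optimizer.save_model_to_file("bert_base_static_end.onnx")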
+ """ + if model_type != "bert" and (num_heads == 0 or hidden_size == 0): + logger.warning( + "Please specify parameters of num_heads and hidden_size when model_type is not 'bert'" + ) + + (optimizer_class, transformer_class, producer, _) = MODEL_TYPES[model_type] + + if model.producer_name and producer != model.producer_name: + logger.warning( + f'Model producer not matched: Expected "{producer}", Got "{model.producer_name}".' + "Please specify correct --model_type parameter." + ) + + if optimization_options is None: + optimization_options = FusionOptions(model_type) + + optimizer = optimizer_class(model, num_heads, hidden_size) + + optimizer.optimize(optimization_options) + + optimizer.topological_sort() + + return optimizer, transformer_class + + +def optimize_to_ixrt(args): + onnx_name = args.onnx[:-5] + model = onnx.load(args.onnx) + + logger.info("simplify..") + simplified_model, check = simplify(model) + logger.info("simplify model end...") + if args.dump_onnx: + onnx.save(simplified_model, onnx_name + "_sim.onnx") + + # transfer to static shape and optimize it + static_sim_model = simplified_model + if args.input_shapes: + for input_tensor in simplified_model.graph.input: + if input_tensor.name in args.input_shapes.keys(): + new_shape = args.input_shapes[input_tensor.name] + dim_list = [] + for dim in new_shape: + if isinstance(dim, int): + dim_proto = onnx.TensorShapeProto.Dimension() + dim_proto.dim_value = dim + dim_list.append(dim_proto) + elif isinstance(dim, str): + dim_proto = onnx.TensorShapeProto.Dimension() + dim_proto.dim_param = dim + dim_list.append(dim_proto) + + del input_tensor.type.tensor_type.shape.dim[:] + input_tensor.type.tensor_type.shape.dim.extend(dim_list) + + try: + auto_merge = False + if args.model_type in ["roformer"]: + auto_merge = True + static_model = SymbolicShapeInference.infer_shapes( + simplified_model, 2**31 - 1, auto_merge, False, 3 + ) + static_sim_model, check = simplify(static_model) + if args.dump_onnx: + onnx.save(static_sim_model, onnx_name + "_sim_static_sim.onnx") + except Exception as e: + static_model = static_sim_model = simplified_model + + if args.dump_onnx: + onnx.save(static_model, onnx_name + "_sim_static.onnx") + + logger.info("start fusion..") + opt_model, _ = optimize_by_fusion( + static_sim_model, args.model_type, args.num_heads, args.hidden_size + ) + opt_model.save_model_to_file(onnx_name + "_end.onnx") + logger.info("done..") + + +def parse_params(params_str): + params = {} + for item in params_str.replace(" ", "").split(","): + key, value = item.split(":") + params[key] = [int(x) if x.isdigit() else x for x in value.split("x")] + return params + + +def args_parser(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--onnx", type=str, default=None, required=False, help="ONNX model file path" + ) + parser.add_argument( + "--num_heads", + type=int, + default=0, + help="Used in model optimization. The num of the head used in the network", + ) + parser.add_argument( + "--hidden_size", + type=int, + default=0, + help="Used in model optimization. 
The hidden_size used in the network", + ) + parser.add_argument( + "--input_shapes", + type=parse_params, + help='Static input_shapes to the inference, format is --input_shapes "input_name1:3x224x224, input_name2:3x224x224"', + ) + parser.add_argument( + "--dump_onnx", + action="store_true", + help="Whether to dump onnx", + ) + parser.add_argument( + "--model_type", + type=str, + default="bert", + choices=["bert", "swint", "roformer", "t5", "yolo", "gpt2", "vit", "conformer"], + help="Which kind of model to optimize", + ) + parser.add_argument( + "--log_level", + type=str, + default="info", + choices=["debug", "info", "error"], + help="Which kind of model to optimize", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = args_parser() + if args.log_level == "info": + logging.basicConfig(level=logging.INFO) + elif args.log_level == "debug": + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.ERROR) + optimize_to_ixrt(args) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py new file mode 100644 index 0000000000000000000000000000000000000000..437e72fce0a316ab9d5041e86c6e9e864272a0b2 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/float16.py @@ -0,0 +1,394 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py +# Modifications: keep_io_types can be list of names; convert initializers if needed to preserve precision; add force_fp16_initializers option. + +import itertools +import logging +from typing import Dict, List + +import numpy as np +import onnx +from onnx import helper, numpy_helper +from onnx import onnx_pb as onnx_proto +from packaging import version + +logger = logging.getLogger(__name__) + + +def _npfloat16_to_int(np_list): + """ + Convert numpy float16 to python int. + + :param np_list: numpy float16 list + :return int_list: python int list + """ + return [int(bin(_.view("H"))[2:].zfill(16), 2) for _ in np_list] + + +def convert_np_to_float16(np_array, min_positive_val=5.96e-08, max_finite_val=65504.0): + """ + Convert float32 numpy array to float16 without changing sign or finiteness. + Positive values less than min_positive_val are mapped to min_positive_val. + Positive finite values greater than max_finite_val are mapped to max_finite_val. + Similar for negative values. NaN, 0, inf, and -inf are unchanged. 
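+
+    For example (illustrative), with the default bounds an input of [1e-9, 1.0, 7e4] is clamped to
+    approximately [5.96e-08, 1.0, 65504.0] before the final cast to numpy.float16.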
+ """ + + def between(a, b, c): + return np.logical_and(a < b, b < c) + + np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array) + np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array) + np_array = np.where(between(max_finite_val, np_array, float("inf")), max_finite_val, np_array) + np_array = np.where(between(float("-inf"), np_array, -max_finite_val), -max_finite_val, np_array) + return np.float16(np_array) + + +def convert_tensor_float_to_float16(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0): + """Convert tensor float to float16. + + Args: + tensor (TensorProto): the tensor to convert. + min_positive_val (float, optional): minimal positive value. Defaults to 1e-7. + max_finite_val (float, optional): maximal finite value. Defaults to 1e4. + + Raises: + ValueError: input type is not TensorProto. + + Returns: + TensorProto: the converted tensor. + """ + + if not isinstance(tensor, onnx_proto.TensorProto): + raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor)) + + if tensor.data_type == onnx_proto.TensorProto.FLOAT: + tensor.data_type = onnx_proto.TensorProto.FLOAT16 + # convert float_data (float type) to float16 and write to int32_data + if tensor.float_data: + float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val) + int_list = _npfloat16_to_int(float16_data) + tensor.int32_data[:] = int_list + tensor.float_data[:] = [] + # convert raw_data (bytes type) + if tensor.raw_data: + # convert n.raw_data to float + float32_list = np.frombuffer(tensor.raw_data, dtype="float32") + # convert float to float16 + float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val) + # convert float16 to bytes and write back to raw_data + tensor.raw_data = float16_list.tobytes() + return tensor + + +def make_value_info_from_tensor(tensor): + shape = numpy_helper.to_array(tensor).shape + return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape) + + +DEFAULT_OP_BLOCK_LIST = [ + "ArrayFeatureExtractor", + "Binarizer", + "CastMap", + "CategoryMapper", + "DictVectorizer", + "FeatureVectorizer", + "Imputer", + "LabelEncoder", + "LinearClassifier", + "LinearRegressor", + "Normalizer", + "OneHotEncoder", + "SVMClassifier", + "SVMRegressor", + "Scaler", + "TreeEnsembleClassifier", + "TreeEnsembleRegressor", + "ZipMap", + "NonMaxSuppression", + "TopK", + "RoiAlign", + "Resize", + "Range", + "CumSum", + "Min", + "Max", + "Upsample", +] + + +class InitializerTracker: + """Class for keeping track of initializer.""" + + def __init__(self, initializer: onnx_proto.TensorProto): + self.initializer = initializer + self.fp32_nodes = [] + self.fp16_nodes = [] + + def add_node(self, node: onnx_proto.NodeProto, is_node_blocked): + if is_node_blocked: + self.fp32_nodes.append(node) + else: + self.fp16_nodes.append(node) + + +def convert_float_to_float16( + model, + min_positive_val=5.96e-08, + max_finite_val=65504.0, + keep_io_types=False, + disable_shape_infer=False, + op_block_list=None, + node_block_list=None, + force_fp16_initializers=False, +): + """Convert model tensor float type in the ONNX ModelProto input to tensor float16. + + Args: + model (ModelProto): The ONNX model to convert. + min_positive_val (float, optional): minimal positive value. Defaults to 5.96e-08. + max_finite_val (float, optional): maximal finite value of float16. Defaults to 65504. 
+ keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. + If True, model inputs/outputs should be left as float32. Defaults to False. + disable_shape_infer (bool, optional): Skips running onnx shape/type inference. Useful if shape inference has been done. Defaults to False. + op_block_list (List[str], optional): List of op types to leave as float32. + Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. + node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. + force_fp16_initializers(bool): force converting all float initializers to float16. + Default to false, which will convert only the one needed to avoid precision loss. + Raises: + ValueError: input type is not ModelProto. + + Returns: + ModelProto: converted model. + """ + assert ( + min_positive_val >= 5.96e-08 + ), "invalid min_positive_val. smallest positive float16 value: subnormal 5.96e-08, and normalized 6.104e-05" + assert max_finite_val <= float(np.finfo(np.float16).max), "invalid max_finite_val. largest float16 value: 65504" + + func_infer_shape = None + if not disable_shape_infer and version.parse(onnx.__version__) >= version.parse("1.2.0"): + try: + from onnx.shape_inference import infer_shapes + + func_infer_shape = infer_shapes + finally: + pass + + if not isinstance(model, onnx_proto.ModelProto): + raise ValueError("Expected model type is an ONNX ModelProto but got %s" % type(model)) + + # create blocklists + if op_block_list is None: + op_block_list = DEFAULT_OP_BLOCK_LIST + if node_block_list is None: + node_block_list = [] + op_block_list = set(op_block_list) + node_block_list = set(node_block_list) + + logger.debug( + f"fp16 parameters: min_positive_val={min_positive_val} max_finite_val={max_finite_val} keep_io_types={keep_io_types} disable_shape_infer={disable_shape_infer} op_block_list={op_block_list} node_block_list={node_block_list} force_fp16_initializers={force_fp16_initializers}" + ) + + # create a queue for BFS + queue = [] + value_info_list = [] + node_list = [] + # type inference on input model + if func_infer_shape is not None: + model = func_infer_shape(model) + queue.append(model) + name_mapping = {} + graph_io_to_skip = set() + io_casts = set() + + fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT] + fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT] + if isinstance(keep_io_types, list): + fp32_inputs = [n for n in fp32_inputs if n in keep_io_types] + fp32_outputs = [n for n in fp32_outputs if n in keep_io_types] + elif not keep_io_types: + fp32_inputs = [] + fp32_outputs = [] + + for i, n in enumerate(model.graph.input): + if n.name in fp32_inputs: + output_name = "graph_input_cast_" + str(i) + name_mapping[n.name] = output_name + graph_io_to_skip.add(n.name) + + node_name = "graph_input_cast" + str(i) + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(n) + new_value_info.name = output_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + # add Cast node (from tensor(float) to tensor(float16) after graph input + new_node = [helper.make_node("Cast", [n.name], [output_name], to=10, name=node_name)] + model.graph.node.extend(new_node) + value_info_list.append(new_value_info) + io_casts.add(node_name) + + for i, n in enumerate(model.graph.output): + if n.name in fp32_outputs: + input_name = 
"graph_output_cast_" + str(i) + name_mapping[n.name] = input_name + graph_io_to_skip.add(n.name) + + node_name = "graph_output_cast" + str(i) + # add Cast node (from tensor(float16) to tensor(float) before graph output + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(n) + new_value_info.name = input_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + new_node = [helper.make_node("Cast", [input_name], [n.name], to=1, name=node_name)] + model.graph.node.extend(new_node) + value_info_list.append(new_value_info) + io_casts.add(node_name) + + fp32_initializers: Dict[str, InitializerTracker] = {} + while queue: + next_level = [] + for q in queue: + # if q is model, push q.graph (GraphProto) + if isinstance(q, onnx_proto.ModelProto): + next_level.append(q.graph) + # if q is model.graph, push q.node.attribute (AttributeProto) + if isinstance(q, onnx_proto.GraphProto): + for n in q.initializer: # TensorProto type + if n.data_type == onnx_proto.TensorProto.FLOAT: + assert n.name not in fp32_initializers + fp32_initializers[n.name] = InitializerTracker(n) + + for n in q.node: + # if n is in the block list (doesn't support float16), no conversion for the node, + # and save the node for further processing + if n.name in io_casts: + continue + for i in range(len(n.input)): + if n.input[i] in name_mapping: + n.input[i] = name_mapping[n.input[i]] + for i in range(len(n.output)): + if n.output[i] in name_mapping: + n.output[i] = name_mapping[n.output[i]] + + is_node_blocked = n.op_type in op_block_list or n.name in node_block_list + for input in n.input: + if input in fp32_initializers: + fp32_initializers[input].add_node(n, is_node_blocked) + + if is_node_blocked: + node_list.append(n) + else: + if n.op_type == "Cast": + for attr in n.attribute: + if attr.name == "to" and attr.i == 1: + attr.i = 10 + break + for attr in n.attribute: + next_level.append(attr) + # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto) + # and process node.attribute.t and node.attribute.tensors (TensorProto) + if isinstance(q, onnx_proto.AttributeProto): + next_level.append(q.g) + for n in q.graphs: + next_level.append(n) + q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val)) + for n in q.tensors: + n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val) + # if q is graph, process input, output and value_info (ValueInfoProto) + if isinstance(q, onnx_proto.GraphProto): + # Note that float initializers tracked by fp32_initializers will be processed later. + # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to + # tensor(float16) except map and seq(map). And save them in value_info_list for further processing + for n in itertools.chain(q.input, q.output, q.value_info): + if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: + if n.name not in graph_io_to_skip: + n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + value_info_list.append(n) + if n.type.HasField("sequence_type"): + if n.type.sequence_type.elem_type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: + if n.name not in graph_io_to_skip: + n.type.sequence_type.elem_type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + value_info_list.append(n) + + queue = next_level + + for key, value in fp32_initializers.items(): + # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. 
+ if force_fp16_initializers or value.fp16_nodes: + value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val) + value_info_list.append(make_value_info_from_tensor(value.initializer)) + if value.fp32_nodes and not force_fp16_initializers: + logger.info( + "initializer is used by both fp32 and fp16 nodes. Consider add these nodes to block list:{}".format( + value.fp16_nodes + ) + ) + + # process the nodes in block list that doesn't support tensor(float16) + for node in node_list: + # if input's name is in the value_info_list meaning input is tensor(float16) type, + # insert a float16 to float Cast node before the node, + # change current node's input name and create new value_info for the new name + for i in range(len(node.input)): + input = node.input[i] + for value_info in value_info_list: + if input == value_info.name: + # create new value_info for current node's new input name + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(value_info) + output_name = node.name + "_input_cast_" + str(i) + new_value_info.name = output_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT + # add Cast node (from tensor(float16) to tensor(float) before current node + node_name = node.name + "_input_cast" + str(i) + new_node = [helper.make_node("Cast", [input], [output_name], to=1, name=node_name)] + model.graph.node.extend(new_node) + # change current node's input name + node.input[i] = output_name + break + # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to + # float16 Cast node after the node, change current node's output name and create new value_info for the new name + for i in range(len(node.output)): + output = node.output[i] + for value_info in value_info_list: + if output == value_info.name: + # create new value_info for current node's new output + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(value_info) + input_name = node.name + "_output_cast_" + str(i) + new_value_info.name = input_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT + # add Cast node (from tensor(float) to tensor(float16) after current node + node_name = node.name + "_output_cast" + str(i) + new_node = [helper.make_node("Cast", [input_name], [output], to=10, name=node_name)] + model.graph.node.extend(new_node) + # change current node's input name + node.output[i] = input_name + break + return model + + +def float_to_float16_max_diff(tensor, min_positive_val=5.96e-08, max_finite_val=65504.0): + """Measure the maximum absolute difference after converting a float tensor to float16.""" + if not isinstance(tensor, onnx_proto.TensorProto): + raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor)) + if tensor.data_type != onnx_proto.TensorProto.FLOAT: + raise ValueError("Expected tensor data type is float.") + + float32_data = None + if tensor.float_data: + float32_data = np.array(tensor.float_data) + + if tensor.raw_data: + float32_data = np.frombuffer(tensor.raw_data, dtype="float32") + + if float32_data is None: + raise RuntimeError("external data not loaded!") + + float16_data = convert_np_to_float16(float32_data, min_positive_val, max_finite_val) + return np.amax(np.abs(float32_data - np.float32(float16_data))) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py new file mode 100644 index 0000000000000000000000000000000000000000..e1fde76f93917ecdce2b22defc5dc5d4bd5bdaea --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fuse_series_bias_add.py @@ -0,0 +1,65 @@ +from logging import getLogger + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from onnx import NodeProto, TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel +import numpy as np +import onnx + +logger = getLogger(__name__) + + +class FusionSerialBiasAdd(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Add", "Softmax") + + def match_parent_path_from_dict(self, start_node, path_dict): + res_path = None + res_nodes = None + for k, v in path_dict.items(): + res_nodes = self.model.match_parent_path(start_node, v[0], v[1]) + if res_nodes is None: + continue + return res_nodes, k + return res_nodes, res_path + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + paths = { + "path1": (["Reshape", "Add", "Reshape", "Add"], [0, 0, 0, 0]), + } + series_nodes, path_chosen = self.match_parent_path_from_dict(node, paths) + if not series_nodes: + return + last_reshape, add_2nd, _, add_1st = series_nodes + + biases = [ + self.model.get_initializer(add_1st.input[1]), + self.model.get_initializer(add_2nd.input[1]) + ] + if not all(biases): + return + + bias_arr_1st = NumpyHelper.to_array(biases[0]) + bias_arr_2nd = NumpyHelper.to_array(biases[1]).squeeze(0) + try: + relative_position_bias = bias_arr_1st + bias_arr_2nd + except Exception as e: + print("Two bias are unrelated:", e) + return + + # Fuse + add_name = self.model.create_node_name("Add", "Add") + B = biases[0] + B.CopyFrom(numpy_helper.from_array(relative_position_bias, B.name)) + + fused_node = helper.make_node( + "Add", + inputs=[add_1st.input[0], B.name], + outputs=last_reshape.output, + name=add_name, + ) + fused_node.domain = "com.iluvatar" + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + self.nodes_to_remove.extend(series_nodes) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..47b8ec777a026d97256547d80e5a3c9d6ef77c2d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_albert_attention.py @@ -0,0 +1,602 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import List, Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_attention import AttentionMask +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +def get_tensor_attr(attrs, attr_name): + result = None + for i in attrs: + if i.name == attr_name: + return numpy_helper.to_array(i.t) + return result + + +class FusionAlbertAttention(Fusion): + """ + Fuse Albert subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + attention_mask: AttentionMask, + ): + super().__init__( + model, + "CustomQKVToContextPluginDynamic_IxRT", + ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"], + ) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_mask = attention_mask + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + q_shape = self.model.get_initializer(reshape_q.input[1]) + if q_shape is None: + logger.debug(f"{reshape_q.input[1]} is not initializer.") + return self.num_heads, self.hidden_size # Fall back to user specified value + + q_shape_value = NumpyHelper.to_array(q_shape) + if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): + logger.debug( + f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." + ) + return self.num_heads, self.hidden_size # Fall back to user specified value + + num_heads = q_shape_value[2] + head_size = q_shape_value[3] + hidden_size = num_heads * head_size + + if self.num_heads > 0 and num_heads != self.num_heads: + if self.num_heads_warning: + logger.warning( + f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value." + ) + self.num_heads_warning = False # Do not show the warning more than once + + if self.hidden_size > 0 and hidden_size != self.hidden_size: + if self.hidden_size_warning: + logger.warning( + f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." 
+ ) + self.hidden_size_warning = ( + False # Do not show the warning more than once + ) + + return num_heads, hidden_size + + def get_add_qk_str(self, add_qk: NodeProto): + shape_infer = self.model.infer_runtime_shape(update=True) + if shape_infer is None: + return + + input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) + input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) + + if input_0_shape is None or input_1_shape is None: + logger.debug(f"one of the inputs of {add_qk} is None") + return None + + if input_0_shape != input_1_shape: + logger.debug(f"the shape of two inputs of {add_qk} is not same") + return None + + return add_qk.input[1] + + def create_attention_node( + self, + mask_index: str, + q_matmul: NodeProto, + k_matmul: NodeProto, + v_matmul: NodeProto, + q_add: NodeProto, + k_add: NodeProto, + v_add: NodeProto, + num_heads: int, + hidden_size: int, + input: str, + output: str, + add_qk_str: str, + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + mask_index (str): mask input + q_matmul (NodeProto): MatMul node in fully connection for Q + k_matmul (NodeProto): MatMul node in fully connection for K + v_matmul (NodeProto): MatMul node in fully connection for V + q_add (NodeProto): Add bias node in fully connection for Q + k_add (NodeProto): Add bias node in fully connection for K + v_add (NodeProto): Add bias node in fully connection for V + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. + """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug( + f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" + ) + return None + + q_weight = self.model.get_initializer(q_matmul.input[1]) + k_weight = self.model.get_initializer(k_matmul.input[1]) + v_weight = self.model.get_initializer(v_matmul.input[1]) + q_bias = self.model.get_initializer( + q_add.input[1] + ) or self.model.get_initializer(q_add.input[0]) + k_bias = self.model.get_initializer( + k_add.input[1] + ) or self.model.get_initializer(k_add.input[0]) + v_bias = self.model.get_initializer( + v_add.input[1] + ) or self.model.get_initializer(v_add.input[0]) + + if q_weight is None: + print( + f"{q_matmul.input[1]} is not an initializer. " + "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" + ) + return None + if not (k_weight and v_weight and q_bias and k_bias): + return None + + qw = NumpyHelper.to_array(q_weight) + kw = NumpyHelper.to_array(k_weight) + vw = NumpyHelper.to_array(v_weight) + + # assert q and k have same shape as expected + assert qw.shape == kw.shape + + qw_in_size = qw.shape[0] + kw_in_size = kw.shape[0] + vw_in_size = vw.shape[0] + + assert qw_in_size == kw_in_size == vw_in_size + + if hidden_size > 0 and hidden_size != qw_in_size: + logger.warning( + f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " + "Please provide a correct input hidden size or pass in 0" + ) + + is_qkv_diff_dims = False + + # All the matrices can have the same shape or q, k matrics can have the same shape with v being different + # For 2d weights, the shapes would be [in_size, out_size]. 
+ # For 3d weights, shape would be [in_size, a, b] where a*b = out_size + qw_out_size = np.prod(qw.shape[1:]) + kw_out_size = np.prod(kw.shape[1:]) + vw_out_size = np.prod(vw.shape[1:]) + + qkv_weight_dim = 0 + qkv_weight = np.concatenate((qw, kw, vw), axis=1) + qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size + + qb = NumpyHelper.to_array(q_bias) + kb = NumpyHelper.to_array(k_bias) + vb = NumpyHelper.to_array(v_bias) + + q_bias_shape = np.prod(qb.shape) + k_bias_shape = np.prod(kb.shape) + v_bias_shape = np.prod(vb.shape) + + assert q_bias_shape == k_bias_shape == qw_out_size + assert v_bias_shape == vw_out_size + + qkv_bias_dim = 0 + if is_qkv_diff_dims: + qkv_bias = np.concatenate((qb, kb, vb), axis=0) + qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape + else: + qkv_bias = np.stack((qb, kb, vb), axis=0) + qkv_bias_dim = 3 * q_bias_shape + + attention_node_name = self.model.create_node_name("Attention") + + weight = helper.make_tensor( + name=attention_node_name + "_qkv_weight", + data_type=TensorProto.FLOAT, + dims=[qkv_weight_dim, qw_in_size], + vals=qkv_weight.transpose(1, 0).flatten().tolist(), + ) + + # Sometimes weights and bias are stored in fp16 + if q_weight.data_type == 10: + weight.CopyFrom( + numpy_helper.from_array( + NumpyHelper.to_array(weight).astype(np.float16), weight.name + ) + ) + self.model.add_initializer(weight, self.this_graph_name) + + bias = helper.make_tensor( + name=attention_node_name + "_qkv_bias", + data_type=TensorProto.FLOAT, + dims=[qkv_bias_dim], + vals=qkv_bias.flatten().tolist(), + ) + if q_bias.data_type == 10: + bias.CopyFrom( + numpy_helper.from_array( + NumpyHelper.to_array(bias).astype(np.float16), bias.name + ) + ) + self.model.add_initializer(bias, self.this_graph_name) + + fc_output_tensor = helper.make_tensor_value_info( + attention_node_name + "_input", TensorProto.FLOAT, [None, None, None] + ) + fc_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[input], + outputs=[fc_output_tensor.name], + name=self.model.create_node_name("AttentionFC", "MatMul_AddBias_"), + ) + fc_node.domain = "com.iluvatar" + b = NumpyHelper.to_array(bias) + fc_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) + fc_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fc_node.attribute.extend([helper.make_attribute("W", weight)]) + fc_node.attribute.extend([helper.make_attribute("B", bias)]) + fc_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fc_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fc_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fc_node.name] = self.this_graph_name + self.nodes_to_add.append(fc_node) + + attention_inputs = [fc_node.output[0]] + if mask_index is not None: + attention_inputs.append(mask_index) + else: + attention_inputs.append("") + + if add_qk_str is not None: + attention_inputs.append("") + attention_inputs.append(add_qk_str) + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=attention_inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend( + [helper.make_attribute("hidden_size", hidden_size)] + ) + attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) + 
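+        # Note: at this point the separate Q/K/V MatMul+Add projections have been folded into a single
+        # CustomFCPluginDynamic_IxRT node whose packed weight is typically [3 * hidden_size, hidden_size],
+        # and this CustomQKVToContextPluginDynamic_IxRT node consumes the FC output plus the mask input.
+        # The remaining attributes below are IxRT plugin metadata.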
attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) + + if is_qkv_diff_dims: + attention_node.attribute.extend( + [ + helper.make_attribute( + "qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size] + ) + ] + ) + + return attention_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + if normalize_node.op_type == "LayerNormalization": + add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + if add_before_layernorm is not None: + start_node = add_before_layernorm + else: + return + + # SkipLayerNormalization has two inputs, and one of them is the root input for attention. + qkv_nodes = self.model.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [None, None, 0, 0, 0], + ) + if qkv_nodes is None: + qkv_nodes = self.model.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [1, None, 0, 0, 0], + ) + einsum_node = None + if qkv_nodes is not None: + (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes + else: + # Match Albert + qkv_nodes = self.model.match_parent_path( + start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] + ) + if qkv_nodes is not None: + (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes + else: + return + + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + """ + Match flaubert Mask + | + Mul --> LayerNormalization --> Attention --> MatMul --> Add + | | + | | + +--------------------------------------------------------- + """ + mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) + if mul_before_layernorm is not None: + mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] + if mul_children is not None and len(mul_children) == 2: + layernorm_node = mul_children[1] + if layernorm_node.op_type == "LayerNormalization": + root_input = layernorm_node.output[0] + else: + return + elif mul_children is not None and len(mul_children) == 5: + root_input = mul_before_layernorm.output[0] + else: + return + elif normalize_node.op_type == "LayerNormalization": + children = input_name_to_nodes[root_input] + for child in children: + if child.op_type == "LayerNormalization": + root_input = child.output[0] + + children = input_name_to_nodes[root_input] + children_types = [child.op_type for child in children] + if children_types.count("MatMul") != 3: + return + + v_nodes = self.model.match_parent_path( + matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] + ) + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + (_, _, add_v, matmul_v) = v_nodes + + is_distill = False + is_distill_add = False + qk_paths = { + "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), + "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), + "path3": (["Softmax", "Where", "MatMul", "Div"], 
[0, 0, 2, 0]), + "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), + } + + qk_nodes = None + for k, v in qk_paths.items(): + qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) + if qk_nodes is None: + continue + if k == "path3": + is_distill = True + if k == "path4": + is_distill_add = True + break + + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + + add_qk = None + matmul_qk = None + where_qk = None + if is_distill: + (_, where_qk, matmul_qk, _) = qk_nodes + elif is_distill_add: + (_, add_qk, where_qk, matmul_qk) = qk_nodes + else: + (_, add_qk, _, matmul_qk) = qk_nodes + + q_nodes = self.model.match_parent_path( + matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None] + ) + if q_nodes is None: + q_nodes = self.model.match_parent_path( + matmul_qk, + ["Div", "Transpose", "Reshape", "Add", "MatMul"], + [0, 0, 0, 0, None], + ) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + reshape_q = q_nodes[-3] + add_q = q_nodes[-2] + matmul_q = q_nodes[-1] + + k_nodes = self.model.match_parent_path( + matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None] + ) + if k_nodes is None: + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], + [1, 0, 0, 0, None], + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + add_k = k_nodes[-2] + matmul_k = k_nodes[-1] + + # Note that Cast might be removed by OnnxRuntime so we match two patterns here. + mask_nodes = None + add_qk_str = None + if is_distill: + _, mask_nodes, _ = self.model.match_parent_paths( + where_qk, + [ + (["Expand", "Reshape", "Equal"], [0, 0, 0]), + (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), + (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), + ], + output_name_to_node, + ) + elif is_distill_add: + _, mask_nodes, _ = self.model.match_parent_paths( + where_qk, + [ + (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), + (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), + ], + output_name_to_node, + ) + if add_qk is not None: + add_qk_str = self.get_add_qk_str(add_qk) + if add_qk_str is None: + logger.debug( + f"fuse_attention: failed to verify shape inference of {add_qk}" + ) + return + else: + _, mask_nodes, _ = self.model.match_parent_paths( + add_qk, + [ + ( + ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], + [None, 0, 1, 0, 0], + ), + (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), + (["Mul", "Sub", "Cast", "Unsqueeze"], [None, 0, 1, 0]), + ], + output_name_to_node, + ) + if mask_nodes is None: + logger.debug("fuse_attention: failed to match mask path") + return + + if ( + matmul_v.input[0] == root_input + and matmul_q.input[0] == root_input + and matmul_k.input[0] == root_input + ): + # mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) + if mask_nodes[0].op_type == "Mul": + mask_val = self.model.get_initializer(mask_nodes[0].input[1]) + if mask_val is not None: + mask_val_arr = NumpyHelper.to_array(mask_val) + mask_val_arr = np.where(mask_val_arr <= -100, -100, 0.0).astype( + np.float32 + ) + mask_val.CopyFrom( + numpy_helper.from_array(mask_val_arr, mask_val.name) + ) + mask_index = mask_nodes[0].output[0] + + attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv + + q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + # number of heads are same for all the paths, hence to create attention node, we pass 
the q_num_heads + # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately + new_node = self.create_attention_node( + mask_index, + matmul_q, + matmul_k, + matmul_v, + add_q, + add_k, + add_v, + q_num_heads, + q_hidden_size, + root_input, + attention_last_node.output[0], + add_qk_str, + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + if einsum_node is not None: + unique_index = einsum_node.input[0] + new_edge = "edge_modified_" + unique_index + shape_tensor = helper.make_tensor( + name="shape_modified_tensor" + unique_index, + data_type=TensorProto.INT64, + dims=[4], + vals=np.int64( + [0, 0, q_num_heads, int(q_hidden_size / q_num_heads)] + ).tobytes(), + raw=True, + ) + self.model.add_initializer(shape_tensor, self.this_graph_name) + self.model.add_node( + helper.make_node( + "Reshape", + [attention_last_node.output[0], shape_tensor.name], + [new_edge], + "reshape_modified_" + unique_index, + ), + self.this_graph_name, + ) + einsum_node.input[0] = new_edge + + self.nodes_to_remove.extend( + [attention_last_node, transpose_qkv, matmul_qkv] + ) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes) + + # Use prune graph to remove mask nodes since they are shared by all attention nodes. + # self.nodes_to_remove.extend(mask_nodes) + self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..c750721836750e7826a06e83a71138001dc79510 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_attention.py @@ -0,0 +1,571 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from onnx import NodeProto, TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +class AttentionMask: + """ + Fuse Attention subgraph into one Attention node. 
+ """ + + def __init__(self, model: OnnxModel): + self.model = model + # A lookup table with mask input as key, and mask index output as value + self.mask_indice = {} + # A lookup table with mask input as key, and cast (to int32) output as value + self.mask_casted = {} + self.utils = FusionUtils(model) + self.mask_format = AttentionMaskFormat.MaskIndexEnd + + def set_mask_format(self, mask_format: AttentionMaskFormat): + self.mask_format = mask_format + + def set_mask_indice(self, mask, mask_index): + if mask in self.mask_indice: + assert mask_index == self.mask_indice[mask] + self.mask_indice[mask] = mask_index + + def get_first_mask(self): + assert len(self.mask_indice) > 0 + return next(iter(self.mask_indice)) + + def process_mask(self, input: str) -> str: + if self.mask_format == AttentionMaskFormat.NoMask: + return None + + if input in self.mask_indice: + return self.mask_indice[input] + + # Add cast to convert int64 to int32 + if self.model.find_graph_input(input): + casted, input_name = self.utils.cast_graph_input_to_int32(input) + else: + input_name, cast_node = self.utils.cast_input_to_int32(input) + casted = True + + if casted: + self.mask_casted[input] = input_name + + # Attention supports int32 attention mask (2D) since 1.4.0 + if self.mask_format == AttentionMaskFormat.AttentionMask: + self.mask_indice[input] = input_name + return input_name + + # Add a mask processing node to convert attention mask to mask index (1D) + output_name = self.model.create_node_name("mask_index") + mask_index_node = helper.make_node( + "ReduceSum", + inputs=[input_name], + outputs=[output_name], + name=self.model.create_node_name("ReduceSum", "MaskReduceSum"), + ) + mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)]) + self.model.add_node(mask_index_node) + + self.mask_indice[input] = output_name + return output_name + + +class FusionAttention(Fusion): + """ + Fuse Attention subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + attention_mask: AttentionMask, + ): + super().__init__(model, "Attention", ["SkipLayerNormalization", "LayerNormalization"]) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_mask = attention_mask + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + q_shape = self.model.get_initializer(reshape_q.input[1]) + if q_shape is None: + logger.debug(f"{reshape_q.input[1]} is not initializer.") + return self.num_heads, self.hidden_size # Fall back to user specified value + + q_shape_value = NumpyHelper.to_array(q_shape) + if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): + logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].") + return self.num_heads, self.hidden_size # Fall back to user specified value + + num_heads = q_shape_value[2] + head_size = q_shape_value[3] + hidden_size = num_heads * head_size + + if self.num_heads > 0 and num_heads != self.num_heads: + if self.num_heads_warning: + logger.warning(f"--num_heads is {self.num_heads}. 
Detected value is {num_heads}. Using detected value.") + self.num_heads_warning = False # Do not show the warning more than once + + if self.hidden_size > 0 and hidden_size != self.hidden_size: + if self.hidden_size_warning: + logger.warning( + f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." + ) + self.hidden_size_warning = False # Do not show the warning more than once + + return num_heads, hidden_size + + def get_add_qk_str(self, add_qk: NodeProto): + shape_infer = self.model.infer_runtime_shape(update=True) + if shape_infer is None: + return + + input_0_shape = shape_infer.get_edge_shape(add_qk.input[0]) + input_1_shape = shape_infer.get_edge_shape(add_qk.input[1]) + + if input_0_shape is None or input_1_shape is None: + logger.debug(f"one of the inputs of {add_qk} is None") + return None + + if input_0_shape != input_1_shape: + logger.debug(f"the shape of two inputs of {add_qk} is not same") + return None + + return add_qk.input[1] + + def create_attention_node( + self, + mask_index: str, + q_matmul: NodeProto, + k_matmul: NodeProto, + v_matmul: NodeProto, + q_add: NodeProto, + k_add: NodeProto, + v_add: NodeProto, + num_heads: int, + hidden_size: int, + input: str, + output: str, + add_qk_str: str, + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + mask_index (str): mask input + q_matmul (NodeProto): MatMul node in fully connection for Q + k_matmul (NodeProto): MatMul node in fully connection for K + v_matmul (NodeProto): MatMul node in fully connection for V + q_add (NodeProto): Add bias node in fully connection for Q + k_add (NodeProto): Add bias node in fully connection for K + v_add (NodeProto): Add bias node in fully connection for V + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. + """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") + return None + + q_weight = self.model.get_initializer(q_matmul.input[1]) + k_weight = self.model.get_initializer(k_matmul.input[1]) + v_weight = self.model.get_initializer(v_matmul.input[1]) + q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0]) + k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0]) + v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0]) + + if q_weight is None: + print( + f"{q_matmul.input[1]} is not an initializer. " + "Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion" + ) + return None + if not (k_weight and v_weight and q_bias and k_bias): + return None + + qw = NumpyHelper.to_array(q_weight) + kw = NumpyHelper.to_array(k_weight) + vw = NumpyHelper.to_array(v_weight) + + # assert q and k have same shape as expected + assert qw.shape == kw.shape + + qw_in_size = qw.shape[0] + kw_in_size = kw.shape[0] + vw_in_size = vw.shape[0] + + assert qw_in_size == kw_in_size == vw_in_size + + if hidden_size > 0 and hidden_size != qw_in_size: + logger.warning( + f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). 
" + "Please provide a correct input hidden size or pass in 0" + ) + + is_qkv_diff_dims = False + if qw.shape != vw.shape: + is_qkv_diff_dims = True + + # All the matrices can have the same shape or q, k matrics can have the same shape with v being different + # For 2d weights, the shapes would be [in_size, out_size]. + # For 3d weights, shape would be [in_size, a, b] where a*b = out_size + qw_out_size = np.prod(qw.shape[1:]) + kw_out_size = np.prod(kw.shape[1:]) + vw_out_size = np.prod(vw.shape[1:]) + + qkv_weight_dim = 0 + if is_qkv_diff_dims: + qkv_weight = np.concatenate((qw, kw, vw), axis=1) + qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size + else: + qkv_weight = np.stack((qw, kw, vw), axis=1) + qkv_weight_dim = 3 * qw_out_size + + qb = NumpyHelper.to_array(q_bias) + kb = NumpyHelper.to_array(k_bias) + vb = NumpyHelper.to_array(v_bias) + + q_bias_shape = np.prod(qb.shape) + k_bias_shape = np.prod(kb.shape) + v_bias_shape = np.prod(vb.shape) + + assert q_bias_shape == k_bias_shape == qw_out_size + assert v_bias_shape == vw_out_size + + qkv_bias_dim = 0 + if is_qkv_diff_dims: + qkv_bias = np.concatenate((qb, kb, vb), axis=0) + qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape + else: + qkv_bias = np.stack((qb, kb, vb), axis=0) + qkv_bias_dim = 3 * q_bias_shape + + attention_node_name = self.model.create_node_name("Attention") + + weight = helper.make_tensor( + name=attention_node_name + "_qkv_weight", + data_type=TensorProto.FLOAT, + dims=[qw_in_size, qkv_weight_dim], + vals=qkv_weight.flatten().tolist(), + ) + + # Sometimes weights and bias are stored in fp16 + if q_weight.data_type == 10: + weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name)) + self.model.add_initializer(weight, self.this_graph_name) + + bias = helper.make_tensor( + name=attention_node_name + "_qkv_bias", + data_type=TensorProto.FLOAT, + dims=[qkv_bias_dim], + vals=qkv_bias.flatten().tolist(), + ) + if q_bias.data_type == 10: + bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name)) + self.model.add_initializer(bias, self.this_graph_name) + + attention_inputs = [ + input, + attention_node_name + "_qkv_weight", + attention_node_name + "_qkv_bias", + ] + if mask_index is not None: + attention_inputs.append(mask_index) + else: + attention_inputs.append("") + + if add_qk_str is not None: + attention_inputs.append("") + attention_inputs.append(add_qk_str) + + attention_node = helper.make_node( + "Attention", + inputs=attention_inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + + if is_qkv_diff_dims: + attention_node.attribute.extend( + [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])] + ) + + return attention_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + if normalize_node.op_type == "LayerNormalization": + add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + if add_before_layernorm is not None: + start_node = add_before_layernorm + else: + return + + # SkipLayerNormalization has two inputs, and one of them is the root input 
for attention. + qkv_nodes = self.model.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [None, None, 0, 0, 0], + ) + einsum_node = None + if qkv_nodes is not None: + (_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes + else: + # Match Albert + qkv_nodes = self.model.match_parent_path( + start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0] + ) + if qkv_nodes is not None: + (_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes + else: + return + + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + """ + Match flaubert Mask + | + Mul --> LayerNormalization --> Attention --> MatMul --> Add + | | + | | + +--------------------------------------------------------- + """ + mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) + if mul_before_layernorm is not None: + mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] + if mul_children is not None and len(mul_children) == 2: + layernorm_node = mul_children[1] + if layernorm_node.op_type == "LayerNormalization": + root_input = layernorm_node.output[0] + else: + return + elif mul_children is not None and len(mul_children) == 5: + root_input = mul_before_layernorm.output[0] + else: + return + elif normalize_node.op_type == "LayerNormalization": + children = input_name_to_nodes[root_input] + for child in children: + if child.op_type == "LayerNormalization": + root_input = child.output[0] + + children = input_name_to_nodes[root_input] + children_types = [child.op_type for child in children] + if children_types.count("MatMul") != 3: + return + + v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]) + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + (_, _, add_v, matmul_v) = v_nodes + + is_distill = False + is_distill_add = False + qk_paths = { + "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), + "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), + "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), + "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), + } + + qk_nodes = None + for k, v in qk_paths.items(): + qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1]) + if qk_nodes is None: + continue + if k == "path3": + is_distill = True + if k == "path4": + is_distill_add = True + break + + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + + add_qk = None + matmul_qk = None + where_qk = None + if is_distill: + (_, where_qk, matmul_qk, _) = qk_nodes + elif is_distill_add: + (_, add_qk, where_qk, matmul_qk) = qk_nodes + else: + (_, add_qk, _, matmul_qk) = qk_nodes + + q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]) + if q_nodes is None: + q_nodes = self.model.match_parent_path( + matmul_qk, + ["Div", "Transpose", "Reshape", "Add", "MatMul"], + [0, 0, 0, 0, None], + ) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + reshape_q = q_nodes[-3] + add_q = q_nodes[-2] + matmul_q = q_nodes[-1] + + k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]) + if k_nodes is None: + k_nodes = 
self.model.match_parent_path( + matmul_qk, + ["Transpose", "Transpose", "Reshape", "Add", "MatMul"], + [1, 0, 0, 0, None], + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + add_k = k_nodes[-2] + matmul_k = k_nodes[-1] + + # Note that Cast might be removed by OnnxRuntime so we match two patterns here. + mask_nodes = None + add_qk_str = None + if is_distill: + _, mask_nodes, _ = self.model.match_parent_paths( + where_qk, + [ + (["Expand", "Reshape", "Equal"], [0, 0, 0]), + (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), + (["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]), + ], + output_name_to_node, + ) + elif is_distill_add: + _, mask_nodes, _ = self.model.match_parent_paths( + where_qk, + [ + (["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]), + (["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]), + ], + output_name_to_node, + ) + if add_qk is not None: + add_qk_str = self.get_add_qk_str(add_qk) + if add_qk_str is None: + logger.debug(f"fuse_attention: failed to verify shape inference of {add_qk}") + return + else: + _, mask_nodes, _ = self.model.match_parent_paths( + add_qk, + [ + ( + ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], + [None, 0, 1, 0, 0], + ), + (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), + ], + output_name_to_node, + ) + if mask_nodes is None: + logger.debug("fuse_attention: failed to match mask path") + return + + if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: + mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) + + attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv + + q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads + # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately + new_node = self.create_attention_node( + mask_index, + matmul_q, + matmul_k, + matmul_v, + add_q, + add_k, + add_v, + q_num_heads, + q_hidden_size, + root_input, + attention_last_node.output[0], + add_qk_str, + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + if einsum_node is not None: + unique_index = einsum_node.input[0] + new_edge = "edge_modified_" + unique_index + shape_tensor = helper.make_tensor( + name="shape_modified_tensor" + unique_index, + data_type=TensorProto.INT64, + dims=[4], + vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(), + raw=True, + ) + self.model.add_initializer(shape_tensor, self.this_graph_name) + self.model.add_node( + helper.make_node( + "Reshape", + [attention_last_node.output[0], shape_tensor.name], + [new_edge], + "reshape_modified_" + unique_index, + ), + self.this_graph_name, + ) + einsum_node.input[0] = new_edge + + self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv]) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes) + + # Use prune graph to remove mask nodes since they are shared by all attention nodes. 
+ # self.nodes_to_remove.extend(mask_nodes) + self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf742a45f7c8a56f1166a32d3b803fb497fe041 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_base.py @@ -0,0 +1,82 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import List, Union + +from onnx import GraphProto + +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class Fusion: + def __init__( + self, + model: OnnxModel, + fused_op_type: str, + search_op_types: Union[str, List[str]], + description: str = None, + ): + self.search_op_types: List[str] = ( + [search_op_types] if isinstance(search_op_types, str) else search_op_types + ) + self.fused_op_type: str = fused_op_type + self.description: str = ( + f"{fused_op_type}({description})" if description else fused_op_type + ) + self.model: OnnxModel = model + self.nodes_to_remove: List = [] + self.nodes_to_add: List = [] + self.prune_graph: bool = False + self.node_name_to_graph_name: dict = {} + self.this_graph_name: str = None + # It is optional that subclass updates fused_count since we will also check nodes_to_add to get counter. + self.fused_count: int = 0 + + def apply(self): + logger.debug(f"start {self.description} fusion...") + input_name_to_nodes = self.model.input_name_to_nodes() + output_name_to_node = self.model.output_name_to_node() + + # This assumes that two search ops will not be fused at same time! 
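+        # Walk every node of each search op type, let the subclass-specific fuse() collect
+        # nodes_to_add / nodes_to_remove, then apply the accumulated graph edits in one pass below.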
+ for search_op_type in self.search_op_types: + for node in self.model.get_nodes_by_op_type(search_op_type): + graph = self.model.get_graph_by_node(node) + if graph is None: + raise Exception("Can not find node in any graphs") + self.this_graph_name = graph.name + self.fuse(node, input_name_to_nodes, output_name_to_node) + + op_list = [node.op_type for node in self.nodes_to_add] + count = max(self.fused_count, op_list.count(self.fused_op_type)) + if count > 0: + logger.info(f"Fused {self.description} count: {count}") + + self.model.remove_nodes(self.nodes_to_remove) + self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name) + + if self.prune_graph: + self.model.prune_graph() + elif self.nodes_to_remove or self.nodes_to_add: + self.model.update_graph() + + def match_parent_path_from_dict( + self, start_node, path_dict, output_name_to_node=None, return_indice=None + ): + res_path = None + res_nodes = None + for k, v in path_dict.items(): + res_nodes = self.model.match_parent_path( + start_node, + v[0], + v[1], + output_name_to_node=output_name_to_node, + return_indice=return_indice, + ) + if res_nodes is None: + continue + return res_nodes, k + return res_nodes, res_path diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3406c7f231b04b6367b4311da315bf8eb3f7df --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_biasgelu.py @@ -0,0 +1,66 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from onnx import helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionBiasGelu(Fusion): + def __init__(self, model: OnnxModel, is_fastgelu): + if is_fastgelu: + super().__init__(model, "FastGelu", "FastGelu", "add bias") + else: + super().__init__(model, "BiasGelu", "Gelu") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + gelu_op_type = node.op_type + fuse_op_type = "BiasGelu" if gelu_op_type == "Gelu" else "FastGelu" + + if len(node.input) != 1: + return + + nodes = self.model.match_parent_path(node, ["Add", "MatMul"], [0, None]) + if nodes is None: + return + (add, matmul) = nodes + + bias_weight = None + # bias should be one dimension + bias_index = -1 + for i, input in enumerate(add.input): + initializer = self.model.get_initializer(input) + if initializer is None: + continue + bias_index = i + bias_weight = NumpyHelper.to_array(initializer) + break + if bias_weight is None: + return + if len(bias_weight.shape) != 1: + return + + subgraph_nodes = [node, add] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + + fused_node = helper.make_node( + fuse_op_type, + inputs=[matmul.output[0], add.input[bias_index]], + outputs=node.output, + name=self.model.create_node_name(fuse_op_type, gelu_op_type + "_AddBias_"), + ) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..e825f95cbe698d9831b7291d5b03336797a8db85 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_attention.py @@ -0,0 +1,150 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import math +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +class FusionConformerAttention(Fusion): + """ + Fuse VideoBertAttention subgraph into one Attention node. 
+ """ + + def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): + super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["Concat"]) + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + self.hidden_size = hidden_size + self.num_heads = num_heads + + def get_num_heads_and_hidden_size( + self, atten_matmul: NodeProto, div: NodeProto + ) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1]) + div_initializer = self.model.get_initializer(div.input[1]) + + # 检查float_data是否为空 + if len(div_initializer.float_data) > 0: + div_value = div_initializer.float_data[0] + else: + # 如果float_data为空,尝试其他方式获取数据 + # 例如,如果数据存储在raw_data中 + if len(div_initializer.raw_data) > 0: + dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] + div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] + else: + raise ValueError("Data not found in the div_initializer") + + atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape + head_dim = math.ceil(div_value * div_value) + hidden_size = atten_matul_shape_value[0] + num_heads = hidden_size // head_dim + + return num_heads, hidden_size + + def create_attention_node( + self, num_heads: int, hidden_size: int, inputs: str, outputs: str + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. 
+ """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug( + f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" + ) + return None + + attention_node_name = self.model.create_node_name("Attention") + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=inputs, + outputs=outputs, + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend( + [helper.make_attribute("hidden_size", hidden_size)] + ) + attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) + attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) + + return attention_node + + def fuse_reshape(self, shape_data_name): + + shape_tensor = helper.make_tensor( + name=shape_data_name, + data_type=TensorProto.INT64, + dims=[3], + vals=np.int64([128, -1, self.hidden_size // self.num_heads]).tobytes(), + raw=True, + ) + self.model.add_initializer(shape_tensor, self.this_graph_name) + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + + paths = { + "path": ( + ["Unsqueeze", "Mul", "Gather", "Shape", "LayerNormalization"], + [None, None, None, None, None], + ), + } + + reshape_nodes, reshape_path = self.match_parent_path_from_dict( + start_node, paths + ) + if reshape_nodes is None: + return + + self.nodes_to_remove.append(start_node) + + self.nodes_to_remove.extend(reshape_nodes[:-1]) + self.fuse_reshape(start_node.output[0]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py new file mode 100644 index 0000000000000000000000000000000000000000..78a40973f37cb0ac9f089ad971a293cb906ffa53 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_conformer_xsoftmax.py @@ -0,0 +1,129 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Tuple, Union + +import numpy as np +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionConformerXSoftmax(Fusion): + """ + Fuse Where + Softmax + Where into one node: XSoftmax + """ + + def __init__(self, model: OnnxModel): + super().__init__(model, "XSoftmax_IxRT", "Softmax") + + def create_xsoftmax_node( + self, data_input: str, mask_input: str, output: str + ) -> Union[NodeProto, None]: + """Create an XSoftmax node. 
+ + Args: + data_input (str): data input name + mask_input (str): max input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. + """ + + unique_index = data_input + new_edge = "edge_modified_" + unique_index + shape_tensor = helper.make_tensor( + name="shape_modified_tensor_" + unique_index, + data_type=TensorProto.INT64, + dims=[4], + vals=np.int64( + [-1, 8, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) + ).tobytes(), + raw=True, + ) + self.model.add_initializer(shape_tensor, self.this_graph_name) + self.model.add_node( + helper.make_node( + "Reshape", + [data_input, shape_tensor.name], + [new_edge], + "reshape_modified_" + unique_index, + ), + self.this_graph_name, + ) + + new_edge2 = "edge_modified2_" + unique_index + xsoftmax_node_name = self.model.create_node_name("XSoftmax") + + xsoftmax_node = helper.make_node( + "XSoftmax_IxRT", + inputs=[new_edge, mask_input], + outputs=[new_edge2], + name=xsoftmax_node_name, + ) + xsoftmax_node.domain = "com.iluvatar" + xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) + xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) + xsoftmax_node.attribute.extend([helper.make_attribute("is_conformer", 1)]) + + shape_tensor2 = helper.make_tensor( + name="shape_modified_tensor2_" + unique_index, + data_type=TensorProto.INT64, + dims=[3], + vals=np.int64( + [-1, 128, 128] # (BSZ, HEAD_NUM, SEQ_LEN, SEQ_LEN) + ).tobytes(), + raw=True, + ) + self.model.add_initializer(shape_tensor2, self.this_graph_name) + self.model.add_node( + helper.make_node( + "Reshape", + [new_edge2, shape_tensor2.name], + [output], + "reshape_modified2_" + unique_index, + ), + self.this_graph_name, + ) + + return xsoftmax_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + + xsoftmax_paths = { + "path": (["Add", "Where", "Reshape", "Expand"], [None, None, None, None]), + } + xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( + node, xsoftmax_paths + ) + + if xsoftmax_nodes is None: + logger.debug("fuse_xsoftmax: failed to match xsoftmax path") + return + else: + (add_node, where_node, reshape_node, expand_node) = xsoftmax_nodes + + mask_input = expand_node.input[0] + + data_output = node.output[0] + + data_input = add_node.input[0] + if where_node.output[0] == add_node.input[0]: + data_input = add_node.input[1] + xsoftmax_node = self.create_xsoftmax_node( + data_input, mask_input, data_output + ) + + self.nodes_to_remove.extend(xsoftmax_nodes) + self.nodes_to_add.append(xsoftmax_node) + self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py new file mode 100644 index 0000000000000000000000000000000000000000..e9e4011509e00ecedd3c5237e4320d3cd1a7d316 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_customfc.py @@ -0,0 +1,344 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionCustomFCGPT2(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Reshape"], "gpt2") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + nodes = self.model.match_parent_path(node, ["Gemm", "Reshape"], [0, 0]) + + if nodes is None: + return False + + (matmul, reshape_before_matmul) = nodes + + matmul_weight = self.model.get_initializer(matmul.input[1]) + matmul_bias = self.model.get_initializer(matmul.input[2]) + + if matmul_weight is None or matmul_bias is None: + return False + + w = NumpyHelper.to_array(matmul_weight) + b = NumpyHelper.to_array(matmul_bias) + + transB = 0 + for attr in matmul.attribute: + if attr.name == "transB": + transB = attr.i + break + + trans_matmul_weight = w + if transB == 0: + trans_matmul_weight = w.transpose(1, 0) + if matmul_weight.name not in self.model.initializer_visited.keys(): + self.model.initializer_visited[matmul_weight.name] = True + if matmul_weight.data_type == 10: + matmul_weight.CopyFrom( + numpy_helper.from_array( + trans_matmul_weight.astype(np.float16), matmul_weight.name + ) + ) + else: + matmul_weight.CopyFrom( + numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) + ) + + if matmul_bias.data_type == 10: + matmul_bias.CopyFrom( + numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) + ) + else: + matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) + + fused_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[reshape_before_matmul.input[0]], + outputs=node.output, + name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) + fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + self.nodes_to_remove.extend([matmul, node, reshape_before_matmul]) + + +class FusionCustomFcRoformer(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"], "roformer fc") + + # For model Roformer. 
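+    # The fuse() below matches a Reshape -> MatMul -> Reshape chain feeding the bias Add and, when the
+    # tensor rank before and after the two reshapes agrees, replaces the whole pattern with a single
+    # CustomFCPluginDynamic_IxRT node carrying the transposed weight and flattened bias.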
+ + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if len(node.input) != 2: + return False + + fc_paths = { + "path1": (["Reshape", "MatMul", "Reshape"], [0, 0, 0]), + "path2": (["Reshape", "MatMul", "Reshape"], [1, 0, 0]), + } + + nodes, paths = self.match_parent_path_from_dict(node, fc_paths) + if nodes is None: + return False + + reshape_after_matmul = nodes[0] + matmul = nodes[1] + reshape_before_matmul = nodes[2] + + reshape_before_shape = None + reshape_after_shape = None + for value_info in self.model.graph().value_info: + if value_info.name == reshape_before_matmul.input[0]: + reshape_before_shape = len(value_info.type.tensor_type.shape.dim) + break + for value_info in self.model.graph().value_info: + if value_info.name == reshape_after_matmul.output[0]: + reshape_after_shape = len(value_info.type.tensor_type.shape.dim) + break + if reshape_before_shape != reshape_after_shape: + return False + + weight = self.model.get_initializer(matmul.input[1]) + bias = self.model.get_initializer(node.input[1]) or self.model.get_initializer( + node.input[0] + ) + + if weight is None or bias is None: + return False + + w = NumpyHelper.to_array(weight) + w_in_size = w.shape[0] + weight_dim = np.prod(w.shape[1:]) + + b = NumpyHelper.to_array(bias) + bias_dim = np.prod(b.shape) + trans_matmul_weight = w.transpose(1, 0) + weight.CopyFrom(onnx.numpy_helper.from_array(trans_matmul_weight, weight.name)) + # Sometimes weights and bias are stored in fp16 + if weight.data_type == 10: + weight.CopyFrom( + numpy_helper.from_array( + trans_matmul_weight.astype(np.float16), weight.name + ) + ) + bias_arr = onnx.numpy_helper.to_array(bias).flatten() + bias.CopyFrom(onnx.numpy_helper.from_array(bias_arr, bias.name)) + if bias.data_type == 10: + bias.CopyFrom( + numpy_helper.from_array( + NumpyHelper.to_array(bias).astype(np.float16), bias.name + ) + ) + + fused_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[reshape_before_matmul.input[0]], + outputs=node.output, + name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fused_node.attribute.extend([helper.make_attribute("W", weight)]) + fused_node.attribute.extend([helper.make_attribute("B", bias)]) + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + + self.nodes_to_remove.extend([node]) + self.nodes_to_remove.extend(nodes) + return True + + +class FusionCustomFC(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "CustomFCPluginDynamic_IxRT", ["Add"]) + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if self.fuse_1(node, input_name_to_nodes, output_name_to_node): + return + + def fuse_1(self, node, input_name_to_nodes, output_name_to_node): + if len(node.input) != 2: + return False + nodes = self.model.match_parent_path(node, ["MatMul"], [None]) + + if nodes is None: + return False + matmul = nodes[0] + + matmul_weight = self.model.get_initializer(matmul.input[1]) + matmul_bias = self.model.get_initializer( + node.input[1] + ) or self.model.get_initializer(node.input[0]) + + if matmul_weight is None or 
matmul_bias is None: + return False + + w = NumpyHelper.to_array(matmul_weight) + b = NumpyHelper.to_array(matmul_bias) + + trans_matmul_weight = w.transpose(1, 0) + if matmul_weight.name not in self.model.initializer_visited.keys(): + self.model.initializer_visited[matmul_weight.name] = True + if matmul_weight.data_type == 10: + matmul_weight.CopyFrom( + numpy_helper.from_array( + trans_matmul_weight.astype(np.float16), matmul_weight.name + ) + ) + else: + matmul_weight.CopyFrom( + numpy_helper.from_array(trans_matmul_weight, matmul_weight.name) + ) + + if matmul_bias.data_type == 10: + matmul_bias.CopyFrom( + numpy_helper.from_array(b.astype(np.float16), matmul_bias.name) + ) + else: + matmul_bias.CopyFrom(numpy_helper.from_array(b, matmul_bias.name)) + + fused_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[matmul.input[0]], + outputs=node.output, + name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("out_dims", b.shape[0])]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fused_node.attribute.extend([helper.make_attribute("W", matmul_weight)]) + fused_node.attribute.extend([helper.make_attribute("B", matmul_bias)]) + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + self.nodes_to_remove.extend([matmul, node]) + return True + + +class FusionCustomFCActivation(Fusion): + def __init__(self, model: OnnxModel): + super().__init__( + model, + "CustomFCPluginDynamic_IxRT", + ["Gelu", "Relu", "CustomGeluPluginDynamic_IxRT", "Mul"], + "with activation", + ) + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if node.op_type == "Mul": + return_indice = [] + nodes = self.model.match_parent_path( + node, + ["Sigmoid", "Mul", "CustomFCPluginDynamic_IxRT"], + [None, 0, 0], + return_indice=return_indice, + ) + if nodes is None: + return + + (sigmoid_node, mul_node, custom_fc_node) = nodes + if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: + return + + activation_type = 20 + for attr in custom_fc_node.attribute: + if attr.name == "act_type": + attr.i = activation_type + break + + custom_fc_node.output[0] = node.output[0] + self.nodes_to_add.append(custom_fc_node) + self.nodes_to_remove.extend([node, sigmoid_node, mul_node, custom_fc_node]) + self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name + else: + nodes = self.model.match_parent_path( + node, ["CustomFCPluginDynamic_IxRT"], [0] + ) + + if nodes is None: + logger.debug("CustomFCActivation: failed to match fc+gelu/relu path") + return + + fc_node = nodes[0] + activation_type = 3 + if node.op_type == "Gelu": + activation_type = 21 + if node.op_type == "Relu": + activation_type = 4 + + for attr in fc_node.attribute: + if attr.name == "act_type": + attr.i = activation_type + break + + fc_node.output[0] = node.output[0] + self.nodes_to_add.append(fc_node) + self.nodes_to_remove.extend([node, fc_node]) + self.node_name_to_graph_name[fc_node.name] = self.this_graph_name + + +class FusionConformerCustomFCActivation(Fusion): + def __init__(self, model: OnnxModel): + super().__init__( + model, + "CustomFCPluginDynamic_IxRT", + ["Mul"], + "with activation", + ) + + 
def fuse(self, node, input_name_to_nodes, output_name_to_node): + + # return_indice = [] + nodes = self.model.match_parent_path( + node, + ["Sigmoid", "CustomFCPluginDynamic_IxRT"], + [ + None, + 0, + ], + # return_indice=return_indice, + ) + if nodes is None: + return + (sigmoid_node, custom_fc_node) = nodes + # if output_name_to_node[node.input[1 - return_indice[0]]] != custom_fc_node: + # return + activation_type = 20 + for attr in custom_fc_node.attribute: + if attr.name == "act_type": + attr.i = activation_type + break + custom_fc_node.attribute.extend([helper.make_attribute("swish_alpha", 1.0)]) + custom_fc_node.output[0] = node.output[0] + self.nodes_to_add.append(custom_fc_node) + self.nodes_to_remove.extend([node, sigmoid_node, custom_fc_node]) + self.node_name_to_graph_name[custom_fc_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..04eb863f81fb8f026c74fa52ce5e2ca959cee13c --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_disentangled_attention.py @@ -0,0 +1,109 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List, Tuple, Union + +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionDisentangledAttention(Fusion): + """ + Match Disentangled Attention + ------------------------------------------- + | + GatherElements --> Add --> Add --> + | + GatherElements --> Transpose -> + """ + + def __init__(self, model: OnnxModel): + super().__init__(model, "DisentangledAttention_IxRT", "Add") + + def create_disentangled_attention_node( + self, + inputs: List[str], + outputs: List[str], + ) -> Union[NodeProto, None]: + """Create an disentangled attention node. + + Args: + inputs List[str]: data input names + outputs List[str]: data output names + + Returns: + Union[NodeProto, None]: the node created or None if failed. 
+ """ + disentangled_attention_node_name = self.model.create_node_name( + "DisentangledAttention" + ) + + disentangled_attention_node = helper.make_node( + "DisentangledAttention_IxRT", + inputs=inputs, + outputs=outputs, + name=disentangled_attention_node_name, + ) + disentangled_attention_node.domain = "com.iluvatar" + disentangled_attention_node.attribute.extend( + [helper.make_attribute("plugin_namespace", "")] + ) + disentangled_attention_node.attribute.extend( + [helper.make_attribute("plugin_version", "1")] + ) + disentangled_attention_node.attribute.extend( + [helper.make_attribute("factor", 0.1)] + ) + disentangled_attention_node.attribute.extend( + [helper.make_attribute("span", 512)] + ) + + return disentangled_attention_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + + disentangled_attention_path1 = { + "path": (["Add", "GatherElements", "MatMul"], [None, None, None]), + } + + disentangled_attention_path2 = { + "path": ( + ["Add", "Transpose", "GatherElements", "MatMul"], + [None, None, None, None], + ), + } + + nodes1, _ = self.match_parent_path_from_dict(node, disentangled_attention_path1) + nodes2, _ = self.match_parent_path_from_dict(node, disentangled_attention_path2) + + if nodes1 is not None and nodes2 is not None: + if nodes1[0] == nodes2[0]: + (head_add, first_gather, first_matmul) = nodes1 + (_, transpose, second_gather, second_matmul) = nodes2 + tail_add = node + + first_input = [i for i in tail_add.input if i != head_add.output[0]][0] + second_input = first_matmul.output[0] + third_input = second_matmul.output[0] + output = tail_add.output[0] + + disentangled_attention_node = self.create_disentangled_attention_node( + [first_input, second_input, third_input], [output] + ) + self.nodes_to_add.append(disentangled_attention_node) + self.node_name_to_graph_name[ + disentangled_attention_node.name + ] = self.this_graph_name + self.nodes_to_remove.append(tail_add) + self.nodes_to_remove.append(head_add) + self.nodes_to_remove.append(first_gather) + self.nodes_to_remove.append(transpose) + self.nodes_to_remove.append(second_gather) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py new file mode 100644 index 0000000000000000000000000000000000000000..90bddbf89ece285a7be5b4e4f45a55defbdd138f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_embedlayer.py @@ -0,0 +1,703 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict, List, Tuple, Union + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import NodeProto, TensorProto, helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionEmbedLayerNoMask(Fusion): + """ + Fuse embedding layer into one node (EmbedLayerNormalization). + It supports the following model types: BERT, DistilBert, ALBert. 
+ """ + + def __init__(self, model: OnnxModel, description: str = "no mask"): + super().__init__( + model, + "EmbedLayerNormalization", + ["LayerNormalization", "SkipLayerNormalization"], + description, + ) + self.utils = FusionUtils(model) + self.shape_infer_helper = self.model.infer_runtime_shape({}, update=True) + # The following will be reset in each fuse call of FusionEmbedLayerNormalization + self.attention = None + self.embed_node = None + + def match_two_gather(self, add: NodeProto) -> Union[None, Tuple[NodeProto, NodeProto]]: + gather_0_path = self.model.match_parent_path(add, ["Gather"], [0]) + if gather_0_path is None: + return None + + gather_1_path = self.model.match_parent_path(add, ["Gather"], [1]) + if gather_1_path is None: + return None + + return gather_0_path[0], gather_1_path[0] + + def check_attention_subgraph( + self, + layernorm: NodeProto, + input_name_to_nodes: Dict[str, List[NodeProto]], + is_distil_bert: bool, + ) -> bool: + """Check that LayerNormalization has a child of Attention node or subgraph like Attention. + + Args: + layernorm (NodeProto): LayerNormalization node + input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes + is_distil_bert (bool): whether it is DistilBert or not + + Returns: + bool: whether there is Attention node or subgraph like Attention + """ + self.attention = self.model.find_first_child_by_type( + layernorm, "Attention", input_name_to_nodes, recursive=False + ) + if self.attention is None: + # In case user disables attention fusion, check whether subgraph looks like Attention. + if layernorm.output[0] not in input_name_to_nodes: + return False + children = input_name_to_nodes[layernorm.output[0]] + + # For Albert, there is MatMul+Add after embedding layer before attention. + if len(children) == 1 and children[0].op_type == "MatMul" and children[0].output[0] in input_name_to_nodes: + grandchildren = input_name_to_nodes[children[0].output[0]] + if ( + len(grandchildren) == 1 + and grandchildren[0].op_type == "Add" + and grandchildren[0].output[0] in input_name_to_nodes + ): + nodes = input_name_to_nodes[grandchildren[0].output[0]] + for node in nodes: + if node.op_type == "Attention": + self.attention = node + return True + children_types = sorted([child.op_type for child in nodes]) + else: + children_types = sorted([child.op_type for child in children]) + + # Two Shape nodes might be merged by ORT + if is_distil_bert: + # SkipLayerNormailization might exist when model has been optimized by ORT first. + if ( + children_types != ["MatMul", "MatMul", "MatMul", "Shape", "SkipLayerNormalization"] + and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape", "Shape"] + and children_types != ["Add", "MatMul", "MatMul", "MatMul", "Shape"] + ): + logger.debug("No Attention like subgraph in children of LayerNormalization") + return False + else: + if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [ + "MatMul", + "MatMul", + "MatMul", + "SkipLayerNormalization", + ]: + logger.debug("No Attention like subgraph in children of LayerNormalization") + return False + return True + + def match_position_embedding_distilbert(self, position_embedding_gather, input_ids, output_name_to_node): + """ Match position embedding path from input_ids to Gather for DistilBert. 
+ + Pattern is like the following: + (input_ids) + | + Shape + | \ + | Gather (indices=1) + | | + | Cast (optional) + | | + | Range (start=0, end=*, delta=1) + | | + | Unsqueeze + | / + Expand + | + Gather + """ + # remove after tests pass + path1 = self.model.match_parent_path(position_embedding_gather, ["Expand", "Shape"], [1, 1]) + if path1 is None: + path1 = self.model.match_parent_path( + position_embedding_gather, + ["Expand", "Where", "Reshape", "Shape"], + [1, 1, 2, 0], + ) + if path1 is None: + return False + + expand, shape = path1[0], path1[-1] + if shape.input[0] != input_ids: + return False + + _, path2, _ = self.model.match_parent_paths( + expand, + [ + (["Unsqueeze", "Range", "Cast", "Gather", "Shape"], [0, 0, 1, 0, 0]), + (["Unsqueeze", "Range", "Gather", "Shape"], [0, 0, 1, 0]), + ], + output_name_to_node, + ) + if path2 is None: + return False + + range_node = path2[1] + if not ( + self.utils.check_node_input_value(range_node, 0, 0) and self.utils.check_node_input_value(range_node, 2, 1) + ): + return False + + gather_node = path2[-2] + if not (self.utils.check_node_input_value(gather_node, 1, 1)): + return False + + shape_node = path2[-1] + if shape_node.input[0] != input_ids: + return False + + return True + + def match_position_embedding_roberta(self, position_embedding_gather, input_ids, output_name_to_node): + """Match position embedding path from input_ids to Gather for Roberta. + + Roberta Embedding Layer Pattern (* is optional since it might be removed by ORT, ? is the padding word id): + (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Mul -- Cast(to=7) -- Add(B=1) -- Cast(to=7)* --> Gather + | ^ + V | + +------------------------------+ + + Roberta new pattern from transformers v4.9: + (input_ids) --> Equal(B=?) -- Not -- Cast(to=6) -- CumSum(axis=1) -- Add(B=0) -- Mul -- Cast(to=7) -- Add(B=1) --> Gather + | ^ + V | + +-------------------------------------------+ + + start_node = position_embedding_gather + start_index = 1 + + # match optional Cast node. + parent = self.model.get_parent(start_node, start_index, output_name_to_node) + if parent is None: + return + if parent.op_type == "Cast": + if OnnxModel.get_node_attribute(parent, "to") != 7: + return + start_node = parent + start_index = 0 + + i, path, return_indices = self.model.match_parent_paths( + start_node, + [ (['Add', 'Cast', 'Mul', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0]), + (['Add', 'Cast', 'Mul', 'Add', 'CumSum', 'Cast', 'Not', 'Equal'], [start_index, 0, 0, 0, 0, 0, 0, 0])], + output_name_to_node) + + if path is not None: + # constant input of Add shall be 1. + i, value = self.model.get_constant_input(path[0]) + if value != 1: + return False + + _, self.padding_word_id = self.model.get_constant_input(path[-1]) + + return input_ids == path[-1].input[0] + """ + + return False + + def match_position_embedding_bert(self, position_embedding_gather, input_ids, output_name_to_node): + """ Match position embedding path from input_ids to Gather for BERT. 
+ + BERT Embedding Layer Pattern: + (input_ids) + / \ + / Shape + / | + / Gather (indices=1) + / | + / Add (optional, B=0) + / | + Gather (segment_ids) Unsqueeze (axes=0) + \ | | + \ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1) + \ / | + Add Gather + \ / + Add + | + LayerNormalization + """ + path = self.model.match_parent_path( + position_embedding_gather, + ["Slice", "Unsqueeze"], + [1, 2], + output_name_to_node, + ) + if path is None: + return False + + slice, unsqueeze = path + slice_weight = self.model.get_constant_value(slice.input[0]) + if not ( + slice_weight is not None + and len(slice_weight.shape) == 2 + and slice_weight.shape[0] == 1 + and self.utils.check_node_input_value(slice, 1, [0]) + and self.utils.check_node_input_value(slice, 3, [1]) + and (len(slice.input) == 4 or self.utils.check_node_input_value(slice, 4, [1])) + ): + return False + + opset_version = self.model.get_opset_version() + if opset_version < 13: + if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): + return False + else: + if not self.utils.check_node_input_value(unsqueeze, 1, [0]): + return False + + node = self.model.get_parent(unsqueeze, 0, output_name_to_node) + if node is None: + return False + if node.op_type == "Add": + if not self.utils.check_node_input_value(node, 1, 0): + return False + gather = self.model.get_parent(node, 0, output_name_to_node) + else: + gather = node + + if gather is None or gather.op_type != "Gather": + return False + if not (self.utils.check_node_input_value(gather, 1, 1)): + return False + + shape = self.model.get_parent(gather, 0, output_name_to_node) + if shape is None or shape.op_type != "Shape": + return False + + return input_ids == shape.input[0] + + def match_position_embedding(self, position_embedding_gather, input_ids, output_name_to_node): + if self.match_position_embedding_bert(position_embedding_gather, input_ids, output_name_to_node): + return True + + # TODO: Support roberta (position starts from 2 instead of 0) in EmbedLayerNormalization kernel + # related: https://github.com/huggingface/transformers/issues/10736 + # if self.match_position_embedding_roberta(position_embedding_gather, input_ids, output_name_to_node): + # return True + + if self.match_position_embedding_distilbert(position_embedding_gather, input_ids, output_name_to_node): + return True + + return False + + def check_embedding(self, word_embedding_gather, segment_embedding_gather, position_embedding_gather): + """Sanity check of embedding weights, and match hidden_size of weights and shape of inputs.""" + input_ids = word_embedding_gather.input[1] + segment_ids = segment_embedding_gather.input[1] if segment_embedding_gather else None + position_ids = position_embedding_gather.input[1] + + if self.shape_infer_helper is not None: + input_ids_shape = self.shape_infer_helper.get_edge_shape(input_ids) + position_ids_shape = self.shape_infer_helper.get_edge_shape(position_ids) + assert input_ids_shape and position_ids_shape + if not ( + len(input_ids_shape) == 2 + and len(position_ids_shape) == 2 + and input_ids_shape[1] == position_ids_shape[1] + ): + logger.info( + "Cannot fuse EmbedLayerNormalization: input_ids and position_ids not matched in 2nd dimension: {} vs {}".format( + input_ids_shape, position_ids_shape + ) + ) + return False + + if segment_ids and not self.shape_infer_helper.compare_shape(input_ids, segment_ids): + logger.info( + "Cannot fuse EmbedLayerNormalization: input_ids and segment_ids does not have same shape: {} != {}".format( + input_ids_shape, + 
self.shape_infer_helper.get_edge_shape(segment_ids), + ) + ) + return False + + word_embedding_table = self.model.get_constant_value(word_embedding_gather.input[0]) + if word_embedding_table is None or len(word_embedding_table.shape) != 2: + logger.info("Cannot fuse EmbedLayerNormalization: word embedding table is not expected") + return False + + position_embedding_table = self.model.get_constant_value(position_embedding_gather.input[0]) + if ( + position_embedding_table is None + or len(position_embedding_table.shape) != 2 + or (word_embedding_table.shape[1] != position_embedding_table.shape[1]) + ): + logger.info("Cannot fuse EmbedLayerNormalization: position embedding table is not expected") + return False + + if segment_ids: + segment_embedding_table = self.model.get_constant_value(segment_embedding_gather.input[0]) + if ( + segment_embedding_table is None + or len(segment_embedding_table.shape) != 2 + or (word_embedding_table.shape[1] != segment_embedding_table.shape[1]) + ): + logger.info("Cannot fuse EmbedLayerNormalization: segment embedding table is not expected") + return False + + # In normal case, word embeding table is the largest, and segment embedding table is the smallest, while postion embedding table is in between. + # TODO: use other information (like initializer names) to identify different embedding weights automatically. + if word_embedding_table.shape[0] <= position_embedding_table.shape[0]: + logger.warning( + f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]}" + ) + + if segment_ids: + if word_embedding_table.shape[0] <= segment_embedding_table.shape[0]: + logger.warning( + f"word_embedding_table ({word_embedding_gather.input[0]}) size {word_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" + ) + + if position_embedding_table.shape[0] <= segment_embedding_table.shape[0]: + logger.warning( + f"position_embedding_table ({position_embedding_gather.input[0]}) size {position_embedding_table.shape[0]} <= segment_embedding_table ({segment_embedding_gather.input[0]}) size {segment_embedding_table.shape[0]}" + ) + + return True + + def cast_to_int32(self, input_name: str) -> Tuple[str, Union[None, NodeProto]]: + """Cast a graph input or node input to int32. + + Args: + input_name (str): name of graph input or node input + + Returns: + A tuple of casted input name and the cast node. + int32_output (str): If input is int32, it is the input name, Otherwise it is output name of Cast node. + input_cast_node (Union[None, NodeProto]): Cast node. It could be None if input is int32. + """ + input_cast_node = None + graph_input = self.model.find_graph_input(input_name) + if graph_input is not None: + if graph_input.type.tensor_type.elem_type != TensorProto.INT32: + int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name) + else: + int32_output = input_name + else: + int32_output, input_cast_node = self.utils.cast_input_to_int32(input_name) + + return int32_output, input_cast_node + + def create_fused_node( + self, + input_ids: str, + layernorm: NodeProto, + word_embedding_gather: NodeProto, + position_embedding_gather: NodeProto, + segment_embedding_gather: Union[None, NodeProto], + position_ids: str = None, + embedding_sum_output=False, + ): + """Create an EmbedLayerNormalization node. Note that segment embedding is optional. 
+ + Args: + input_ids (str): input_ids for word embeddings + layernorm (NodeProto): LayerNormalization or SkipLayerNormalization node. + word_embedding_gather (NodeProto): the Gather node for word embedding + position_embedding_gather (NodeProto): the Gather node for position embedding + segment_embedding_gather (Union[None, NodeProto]): the Gather node for segment embedding, or None. + + Returns: + NodeProto: the EmbedLayerNormalization node created. + """ + nodes_to_add = [] + input_ids, _ = self.cast_to_int32(input_ids) + + node_name = self.model.create_node_name("EmbedLayerNormalization") + + if layernorm.op_type == "LayerNormalization": + gamma = layernorm.input[1] + beta = layernorm.input[2] + else: # SkipLayerNormalization + gamma = layernorm.input[2] + beta = layernorm.input[3] + + embed_node_inputs = None + if segment_embedding_gather is not None: + segment_ids, _ = self.cast_to_int32(segment_embedding_gather.input[1]) + + embed_node_inputs = [ + input_ids, + segment_ids, + word_embedding_gather.input[0], + position_embedding_gather.input[0], + segment_embedding_gather.input[0], + gamma, + beta, + ] + else: # no segment embedding + embed_node_inputs = [ + input_ids, + "", + word_embedding_gather.input[0], + position_embedding_gather.input[0], + "", + gamma, + beta, + ] + + if position_ids is not None: + # Adding an empty input for mask before position_ids + embed_node_inputs.append("") + position_ids, _ = self.cast_to_int32(position_ids) + embed_node_inputs.append(position_ids) + + embed_node_outputs = [node_name + "_output", node_name + "_dummy_mask_index"] + if embedding_sum_output: + embed_node_outputs.append(node_name + "_embedding_sum") + + embed_node = helper.make_node( + "EmbedLayerNormalization", + embed_node_inputs, + outputs=embed_node_outputs, + name=node_name, + ) + + embed_node.domain = "com.microsoft" + + # Pass attribute "epsilon" from normalize node to EmbedLayerNormalization. + for att in layernorm.attribute: + if att.name == "epsilon": + embed_node.attribute.extend([att]) + + # Set default value to 1e-12 if no attribute is found. + # OnnxRuntime 1.2.0 or older has no epsilon attribute. The optimized model can only work for 1.3.0 or later. + if len(embed_node.attribute) == 0: + embed_node.attribute.extend([helper.make_attribute("epsilon", 1.0e-12)]) + + # Make sure new EmbedLayerNormalization node is the last one in self.nodes_to_add. 
+ nodes_to_add.append(embed_node) + for node in nodes_to_add: + self.node_name_to_graph_name[node.name] = self.this_graph_name + self.nodes_to_add.extend(nodes_to_add) + + self.embed_node = embed_node + return embed_node + + def finish_fusion(self, layernorm, embed_node): + self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) + # use prune graph to remove nodes that is not needed + self.prune_graph = True + + def is_embedding_sum_needed(self, add_before_layer_norm): + """Check that Add before layer norm has an output to add before next layernorm + + Args: + add_before_layer_norm (NodeProto): Add before any LayerNormalization node in topological order of graph + + Returns: + bool: whether there is an extra output needed out of embed layer norm node + """ + + nodes = self.model.get_children(add_before_layer_norm) + + return len(nodes) > 1 + + def fuse_gpt2(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node): + # graph checks + # gpt2 has no segment embedding, subgraph pattern is like + # input_ids position_ids + # | | + # Gather Gather + # \ / + # Add _ _ _ _ _ + # | | + # LayerNormalization | + # | | + # Attention | + # | | + # Matmul | + # | / + # Add / + # \ / + # Add + two_gather = self.match_two_gather(add_before_layernorm) + if two_gather is None: + return False + + add_output = add_before_layernorm.output[0] + + word_embedding_gather, position_embedding_gather = two_gather + input_ids = word_embedding_gather.input[1] + position_ids = position_embedding_gather.input[1] + + if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False): + return False + + if not self.check_embedding(word_embedding_gather, None, position_embedding_gather): + return False + + optional_embedding_sum_output = False + if self.is_embedding_sum_needed(add_before_layernorm): + optional_embedding_sum_output = True + + # make the fused node + embed_node = self.create_fused_node( + input_ids, + layernorm, + word_embedding_gather, + position_embedding_gather, + None, + position_ids, + optional_embedding_sum_output, + ) + + # direct the output to another add too + self.model.replace_input_of_all_nodes(layernorm.output[0], embed_node.output[0]) + if optional_embedding_sum_output: + self.model.replace_input_of_all_nodes(add_output, embed_node.output[2]) + + return True + + def fuse_distilbert(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node): + """Fuse embedding layer for DistilBert + Args: + layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization + add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself + input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes + output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes + """ + + # DistilBert has no segment embedding, subgraph pattern is like + # input_ids + # | \ + # | (position_embedding_subgraph) + # | | + # Gather Gather + # \ / + # Add + # | + # LayerNormalization + two_gather = self.match_two_gather(add_before_layernorm) + if two_gather is None: + return False + + word_embedding_gather, position_embedding_gather = two_gather + input_ids = word_embedding_gather.input[1] + + if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=True): + return False + + if not self.match_position_embedding(position_embedding_gather, input_ids, output_name_to_node): + return False + + if not 
self.check_embedding(word_embedding_gather, None, position_embedding_gather): + return False + + embed_node = self.create_fused_node( + input_ids, layernorm, word_embedding_gather, position_embedding_gather, None + ) + self.finish_fusion(layernorm, embed_node) + return True + + def fuse_bert(self, layernorm, add_before_layernorm, input_name_to_nodes, output_name_to_node): + """Fuse embedding layer for Bert + Args: + layernorm (NodeProto): node of LayerNormalization or SkipLayerNormalization + add_before_layernorm (NodeProto): the Add node before LayerNormalization, or the SkipLayerNormalization itself + input_name_to_nodes (Dict[str, List[NodeProto]]): map from input name to nodes + output_name_to_node (Dict[str, List[NodeProto]]): map from output name to nodes + """ + + add_2_gather = self.model.match_parent_path(add_before_layernorm, ["Add"], [0]) + if add_2_gather is None: + return False + + two_gather = self.match_two_gather(add_2_gather[0]) + if two_gather is None: + return False + + word_embedding_gather, segment_embedding_gather = two_gather + + input_ids = word_embedding_gather.input[1] + + if not self.check_attention_subgraph(layernorm, input_name_to_nodes, is_distil_bert=False): + return False + + position_embedding_path = self.model.match_parent_path(add_before_layernorm, ["Gather"], [1]) + if position_embedding_path is None: + return False + + position_embedding_gather = position_embedding_path[0] + if not self.match_position_embedding(position_embedding_gather, input_ids, output_name_to_node): + if not self.match_position_embedding(segment_embedding_gather, input_ids, output_name_to_node): + return False + # position and segment are switched + temp = segment_embedding_gather + segment_embedding_gather = position_embedding_gather + position_embedding_gather = temp + + if not self.check_embedding(word_embedding_gather, segment_embedding_gather, position_embedding_gather): + return False + + embed_node = self.create_fused_node( + input_ids, + layernorm, + word_embedding_gather, + position_embedding_gather, + segment_embedding_gather, + ) + self.finish_fusion(layernorm, embed_node) + return True + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if node.op_type == "LayerNormalization": + first_add_path = self.model.match_parent_path(node, ["Add"], [0]) + if first_add_path is None: + return + add_before_layernorm = first_add_path[0] + else: # SkipLayerNormalization + add_before_layernorm = node # Add is fused into SkipLayerNormalization + + if self.fuse_gpt2(node, add_before_layernorm, input_name_to_nodes, output_name_to_node): + return + + if self.fuse_distilbert(node, add_before_layernorm, input_name_to_nodes, output_name_to_node): + return + + if self.fuse_bert(node, add_before_layernorm, input_name_to_nodes, output_name_to_node): + return + + +class FusionEmbedLayerNormalization(FusionEmbedLayerNoMask): + def __init__(self, model: OnnxModel): + super().__init__(model, "with mask") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + # Reset attention and embed_node so that we know fusion is successful when they are not None. 
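+        # After the base-class fusion has run, re-route the attention mask: when the
+        # Attention node's mask_index is produced by a ReduceSum over the raw mask,
+        # that ReduceSum is dropped and the raw mask is appended as an extra input of
+        # EmbedLayerNormalization, which then emits the mask index as its second output.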
+ self.attention = None + self.embed_node = None + super().fuse(node, input_name_to_nodes, output_name_to_node) + + if self.attention and self.embed_node: + mask_index = self.attention.input[3] + if mask_index in output_name_to_node: + node = output_name_to_node[mask_index] + if node.op_type == "ReduceSum": + embed_node = self.embed_node + mask_input_name = node.input[0] + self.nodes_to_remove.extend([node]) + embed_node.input.append(mask_input_name) + embed_node.output[1] = mask_index diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py new file mode 100644 index 0000000000000000000000000000000000000000..0e24a9dd7e018bd949f4a2bba18de7d2c909ce2b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_fastgelu.py @@ -0,0 +1,404 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict, Optional + +from onnx import helper + +from .fusion_base import Fusion +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionFastGelu(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "CustomGeluPluginDynamic_IxRT", "Tanh") + + def fuse(self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + if self.fuse_1(tanh_node, input_name_to_nodes, output_name_to_node): + return + + if self.fuse_2(tanh_node, input_name_to_nodes, output_name_to_node): + return + + if self.fuse_3(tanh_node, input_name_to_nodes, output_name_to_node): + return + + def fuse_1( + self, tanh_node, input_name_to_nodes, output_name_to_node + ) -> Optional[bool]: + """ + Fuse Gelu with tanh into one node: + +---------------------------+ + | | + | v + [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul + | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ + | | + +------> Mul(B=0.5)--------------------------------------------+ + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
+ """ + if tanh_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[tanh_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_tanh = children[0] + + if not self.model.has_constant_input(add_after_tanh, 1.0): + return + + if add_after_tanh.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_tanh.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_tanh = children[0] + + mul_half = self.model.match_parent( + mul_after_tanh, "Mul", None, output_name_to_node + ) + if mul_half is None: + return + + i = self.model.find_constant_input(mul_half, 0.5) + if i < 0: + return + + root_input = mul_half.input[0 if i == 1 else 1] + + # root_node could be None when root_input is graph input + root_node = self.model.get_parent( + mul_half, 0 if i == 1 else 1, output_name_to_node + ) + + mul_before_tanh = self.model.match_parent( + tanh_node, "Mul", 0, output_name_to_node + ) + if mul_before_tanh is None: + return + + i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) + if i < 0: + return + + add_before_tanh = self.model.match_parent( + mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node + ) + if add_before_tanh is None: + return + + mul_after_pow = self.model.match_parent( + add_before_tanh, + "Mul", + None, + output_name_to_node, + exclude=[root_node] if root_node else [], + ) + if mul_after_pow is None: + return + + i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) + if i < 0: + return + + pow = self.model.match_parent( + mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node + ) + if pow is None: + return + + if not self.model.has_constant_input(pow, 3.0): + return + + if pow.input[0] != root_input: + return + + subgraph_nodes = [ + mul_after_tanh, + mul_half, + add_after_tanh, + tanh_node, + mul_before_tanh, + add_before_tanh, + mul_after_pow, + pow, + ] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [mul_after_tanh.output[0]], + input_name_to_nodes, + output_name_to_node, + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node( + "CustomGeluPluginDynamic_IxRT", + inputs=[root_input], + outputs=mul_after_tanh.output, + name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True + + def fuse_2( + self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict + ) -> Optional[bool]: + """ + This pattern is from Tensorflow model. + Fuse Gelu with tanh into one node: + +---------------------------+ + | | + | v + [root] --> Pow --> Mul -----> Add --> Mul --> Tanh --> Add --> Mul(B=0.5)-->Mul--> + | (Y=3) (B=0.0447...) (B=0.7978...) (B=1) ^ + | | + +---------------------------------------------------------------------------+ + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
+ """ + if tanh_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[tanh_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_tanh = children[0] + + if not self.model.has_constant_input(add_after_tanh, 1.0): + return + + if add_after_tanh.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_tanh.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_half = children[0] + + i = self.model.find_constant_input(mul_half, 0.5) + if i < 0: + return + + if mul_half.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[mul_half.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_mul_half = children[0] + + root_node = self.model.get_parent( + mul_after_mul_half, + 0 if mul_after_mul_half.input[1] == mul_half.output[0] else 1, + output_name_to_node, + ) + if root_node is None: + return + + mul_before_tanh = self.model.match_parent( + tanh_node, "Mul", 0, output_name_to_node + ) + if mul_before_tanh is None: + return + + i = self.model.find_constant_input(mul_before_tanh, 0.7978, delta=0.0001) + if i < 0: + return + + add_before_tanh = self.model.match_parent( + mul_before_tanh, "Add", 0 if i == 1 else 1, output_name_to_node + ) + if add_before_tanh is None: + return + + mul_after_pow = self.model.match_parent( + add_before_tanh, "Mul", None, output_name_to_node, exclude=[root_node] + ) + if mul_after_pow is None: + return + + i = self.model.find_constant_input(mul_after_pow, 0.0447, delta=0.0001) + if i < 0: + return + + pow = self.model.match_parent( + mul_after_pow, "Pow", 0 if i == 1 else 1, output_name_to_node + ) + if pow is None: + return + + if not self.model.has_constant_input(pow, 3.0): + return + + if pow.input[0] != root_node.output[0]: + return + + subgraph_nodes = [ + mul_after_mul_half, + mul_half, + add_after_tanh, + tanh_node, + mul_before_tanh, + add_before_tanh, + mul_after_pow, + pow, + ] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [mul_after_mul_half.output[0]], + input_name_to_nodes, + output_name_to_node, + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node( + "CustomGeluPluginDynamic_IxRT", + inputs=[root_node.output[0]], + outputs=mul_after_mul_half.output, + name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True + + def fuse_3( + self, tanh_node, input_name_to_nodes: Dict, output_name_to_node: Dict + ) -> Optional[bool]: + """ + OpenAI's gelu implementation, also used in Megatron: + Gelu(x) = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x))) + + Fuse subgraph into a FastGelu node: + +------------ Mul (B=0.79788456) -------------------+ + | | + +-------------------------------+ | + | | | + | v v + [root] --> Mul (B=0.044715) --> Mul --> Add(B=1) --> Mul --> Tanh --> Add(B=1) --> Mul--> + | ^ + | | + +-----------> Mul (B=0.5) --------------------------------------------------------+ + """ + if tanh_node.output[0] not in input_name_to_nodes: + return + + children = input_name_to_nodes[tanh_node.output[0]] 
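+        # Walk forward from Tanh: its only consumer must be an Add with constant 1,
+        # whose only consumer is the final Mul; any other shape means this is not the
+        # Megatron/OpenAI gelu tail, so the match is abandoned.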
+ if len(children) != 1 or children[0].op_type != "Add": + return + add_after_tanh = children[0] + + if not self.model.has_constant_input(add_after_tanh, 1.0): + return + + if add_after_tanh.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_tanh.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_last = children[0] + + mul_half = self.model.match_parent(mul_last, "Mul", None, output_name_to_node) + if mul_half is None: + return + + i = self.model.find_constant_input(mul_half, 0.5) + if i < 0: + return + + root_input = mul_half.input[0 if i == 1 else 1] + + mul_before_tanh = self.model.match_parent( + tanh_node, "Mul", 0, output_name_to_node + ) + if mul_before_tanh is None: + return + + add_1 = self.model.match_parent( + mul_before_tanh, "Add", None, output_name_to_node + ) + if add_1 is None: + return + j = self.model.find_constant_input(add_1, 1.0) + if j < 0: + return + + mul_7978 = self.model.match_parent( + mul_before_tanh, "Mul", None, output_name_to_node + ) + if mul_7978 is None: + return + k = self.model.find_constant_input(mul_7978, 0.7978, delta=0.0001) + if k < 0: + return + if mul_7978.input[0 if k == 1 else 1] != root_input: + return + + mul_before_add_1 = self.model.match_parent( + add_1, "Mul", 0 if j == 1 else 1, output_name_to_node + ) + if mul_before_add_1 is None: + return + + if mul_before_add_1.input[0] == root_input: + another = 1 + elif mul_before_add_1.input[1] == root_input: + another = 0 + else: + return + + mul_0447 = self.model.match_parent( + mul_before_add_1, "Mul", another, output_name_to_node + ) + if mul_0447 is None: + return + m = self.model.find_constant_input(mul_0447, 0.0447, delta=0.0001) + if m < 0: + return + + if mul_0447.input[0 if m == 1 else 1] != root_input: + return + + subgraph_nodes = [ + mul_0447, + mul_before_add_1, + add_1, + mul_before_tanh, + tanh_node, + add_after_tanh, + mul_7978, + mul_half, + mul_last, + ] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [mul_last.output[0]], + input_name_to_nodes, + output_name_to_node, + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node( + "CustomGeluPluginDynamic_IxRT", + inputs=[root_input], + outputs=mul_last.output, + name=self.model.create_node_name("CustomGeluPluginDynamic_IxRT"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..5b6d66ad3f6ae5e73a2f921c9b807fa22e439c33 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_format_roformer.py @@ -0,0 +1,107 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +import math +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionRemoveUselessElementwise(Fusion): + """ + Fusion to remove useless elementwise in roformer model. + """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__(model, "Sqrt", "Sqrt") + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + paths = { + "path1": ( + ["Max", "Min", "Add", "GlobalAveragePool"], + [None, None, None, None], + ), + } + + pool_nodes, pool_path = self.match_parent_path_from_dict(node, paths) + + if pool_nodes is None: + logger.debug("GlobalAveragePool: failed searching path after pool node.") + return + + max_node = pool_nodes[0] + min_node = pool_nodes[1] + add_node = pool_nodes[2] + pool_node = pool_nodes[3] + if not self.model.has_constant_input(add_node, 9.999999960041972e-13): + return + + if not self.model.has_constant_input(max_node, 0): + return + + max_node.input[0] = pool_node.output[0] + self.nodes_to_remove.extend([min_node, add_node]) + + +class FusionFormatInvalidMask(Fusion): + """ + Fusion to format invalid mask in roformer model. + """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__(model, "", ["Greater"]) + + def fuse(self, start_node, input_name_to_nodes, output_name_to_node): + nodes = self.model.match_parent_path( + start_node, + [ + "ReduceMin", + "Cast", + "Concat", + "Unsqueeze", + "Greater", + "ReduceMin", + "Cast", + "Concat", + "Unsqueeze", + ], + [0, 0, 0, 0, 0, 0, 0, 0, 0], + ) + + if nodes is None: + logger.debug("Roformer: unable to format the mask.") + return + + unsqueeze_node = nodes[-1] + + for node in self.model.graph().node: + for (id, input) in enumerate(node.input): + if start_node.output[0] == input: + node.input[id] = unsqueeze_node.input[0] + + self.nodes_to_remove.extend(nodes) + self.nodes_to_remove.extend([start_node]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py new file mode 100644 index 0000000000000000000000000000000000000000..f4c5c7e848dc033d69d5fad17834f1b20ed89bd0 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu.py @@ -0,0 +1,333 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict, Optional + +from .fusion_base import Fusion +from onnx import helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionGelu(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Gelu", "Erf") + + def fuse(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + if self.fuse_1(erf_node, input_name_to_nodes, output_name_to_node): + return + if self.fuse_2(erf_node, input_name_to_nodes, output_name_to_node): + return + if self.fuse_3(erf_node, input_name_to_nodes, output_name_to_node): + return + self.fuse_4(erf_node, input_name_to_nodes, output_name_to_node) + + def fuse_1(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + """ + This pattern is from PyTorch model + Fuse Gelu with Erf into one node: + Pattern 1: + +-------Mul(0.5)---------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul --> + (B=1.4142...) (1) + + Pattern 2: + +------------------------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul -->Mul --> + (B=1.4142...) (1) (0.5) + + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. + """ + if erf_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[erf_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_erf = children[0] + + if not self.model.has_constant_input(add_after_erf, 1): + return + + if add_after_erf.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_erf = children[0] + + div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) + if div is None: + return + + if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: + return + + subgraph_input = div.input[0] + + another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 + if subgraph_input == mul_after_erf.input[another]: # pattern 2 + children = input_name_to_nodes[mul_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_half = children[0] + if not self.model.has_constant_input(mul_half, 0.5): + return + subgraph_output = mul_half.output[0] + else: # pattern 1 + mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node) + if mul_half is None: + return + + if not self.model.has_constant_input(mul_half, 0.5): + return + + if subgraph_input not in mul_half.input: + return + + subgraph_output = mul_after_erf.output[0] + + subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul_half] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output]) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True + + def fuse_2(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + """ + This pattern is from Keras model + Fuse Gelu with Erf into one node: + +------------------------------------------+ + | | + | v + [root] --> Div -----> Erf 
--> Add --> Mul -->Mul + (B=1.4142...) (A=1) (A=0.5) + + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. + """ + if erf_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[erf_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_erf = children[0] + + if not self.model.has_constant_input(add_after_erf, 1): + return + + if add_after_erf.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_erf = children[0] + + if not self.model.has_constant_input(mul_after_erf, 0.5): + return + + if mul_after_erf.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[mul_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul = children[0] + + div = self.model.match_parent(erf_node, "Div", 0, output_name_to_node) + if div is None: + return + + sqrt_node = None + if self.model.find_constant_input(div, 1.4142, delta=0.001) != 1: + sqrt_node = self.model.match_parent(div, "Sqrt", 1, output_name_to_node) + if sqrt_node is None: + return + if not self.model.has_constant_input(sqrt_node, 2.0): + return + + root_node = self.model.get_parent(div, 0, output_name_to_node) + if root_node is None: + return + + if root_node.output[0] not in mul.input: + return + + subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] + if sqrt_node: + subgraph_nodes.append(sqrt_node) + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [mul.output[0]], input_name_to_nodes, output_name_to_node + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]]) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True + + def fuse_3(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + """ + This pattern is from TensorFlow model + Fuse Gelu with Erf into one node: + +----------------------------------------------+ + | | + | v + [root] --> Mul -----> Erf --> Add --> Mul -->Mul + (A=0.7071067690849304) (B=1) (B=0.5) + + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
+ """ + + if erf_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[erf_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_erf = children[0] + + if not self.model.has_constant_input(add_after_erf, 1): + return + + if add_after_erf.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_half = children[0] + + if not self.model.has_constant_input(mul_half, 0.5): + return + + first_mul = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node) + if first_mul is None: + return + + i = self.model.find_constant_input(first_mul, 0.7071067690849304, delta=0.001) + if i < 0: + return + + root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node) + if root_node is None: + return + + if mul_half.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[mul_half.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + last_mul = children[0] + + if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]): + return + + subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [last_mul.output[0]], + input_name_to_nodes, + output_name_to_node, + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]]) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True + + def fuse_4(self, erf_node, input_name_to_nodes: Dict, output_name_to_node: Dict) -> Optional[bool]: + """ + This pattern is from TensorFlow model + Fuse Gelu with Erf into one node: + Pattern 1: + +-------Mul(0.5)---------------------+ + | | + | v + [root] --> Mul -----> Erf --> Add --> Mul --> + (B=0.7071...) (1) + + Pattern 2: + +------------------------------------+ + | | + | v + [root] --> Mul -----> Erf --> Add --> Mul -->Mul --> + (B=0.7071...) (1) (0.5) + + Note that constant input for Add and Mul could be first or second input: like either A=0.5 or B=0.5 is fine. 
+ """ + if erf_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[erf_node.output[0]] + if len(children) != 1 or children[0].op_type != "Add": + return + add_after_erf = children[0] + + if not self.model.has_constant_input(add_after_erf, 1): + return + + if add_after_erf.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[add_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_erf = children[0] + + mul_before_erf = self.model.match_parent(erf_node, "Mul", 0, output_name_to_node) + if mul_before_erf is None: + return + + if self.model.find_constant_input(mul_before_erf, 0.7071, delta=0.001) != 1: + return + + subgraph_input = mul_before_erf.input[0] + + another = 1 if mul_after_erf.input[0] == add_after_erf.output[0] else 0 + if subgraph_input == mul_after_erf.input[another]: # pattern 2 + children = input_name_to_nodes[mul_after_erf.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_half = children[0] + if not self.model.has_constant_input(mul_half, 0.5): + return + subgraph_output = mul_half.output[0] + else: # pattern 1 + mul_half = self.model.match_parent(mul_after_erf, "Mul", another, output_name_to_node) + if mul_half is None: + return + + if not self.model.has_constant_input(mul_half, 0.5): + return + + if subgraph_input not in mul_half.input: + return + + subgraph_output = mul_after_erf.output[0] + + subgraph_nodes = [mul_before_erf, erf_node, add_after_erf, mul_after_erf, mul_half] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + fused_node = helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output]) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + return True \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py new file mode 100644 index 0000000000000000000000000000000000000000..35f4b93a732e7cc73dd5f9ae917f75bd505c93a3 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gelu_approximation.py @@ -0,0 +1,27 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger + +from .fusion_base import Fusion +from onnx import helper +from .onnx_model import OnnxModel + + +class FusionGeluApproximation(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "FastGelu", ["Gelu", "BiasGelu"], "GeluApproximation") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + new_node = helper.make_node( + "FastGelu", + inputs=node.input, + outputs=node.output, + name=self.model.create_node_name("FastGelu", node.op_type + "_Approximation"), + ) + new_node.domain = "com.microsoft" + self.nodes_to_remove.append(node) + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..b856dd19de9f03cd5b799eb1e042ed6bce193fd2 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention.py @@ -0,0 +1,473 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger + +import numpy as np +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionGptAttentionPastBase(Fusion): + """Base class for GPT Attention Fusion with past state""" + + def __init__(self, model: OnnxModel, num_heads: int): + super().__init__(model, "Attention", "LayerNormalization", "with past") + self.num_heads = num_heads + self.utils = FusionUtils(model) + self.casted_attention_mask = {} # map from name of attention mask to the name that casted to int32 + + def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): + # Pattern 1: + # {past} + # / \ + # / \ + # Gather(axes=0, indices=0) Gather(indices=1) + # | | + # Transpose (perm=0,1,3,2) | + # | | + # Concat_k Concat_v + # | / + # Transpose (perm=0,1,3,2) / + # | / + # Unsqueeze Unsqueeze + # \ / + # \ / + # Concat + # | + # {present} + gather = self.model.get_parent(concat_v, 0, output_name_to_node) + if gather.op_type != "Gather": + logger.debug("match_past_pattern_1: expect Gather for past") + return None + + if not self.model.find_constant_input(gather, 1) == 1: + logger.debug("match_past_pattern_1: expect indices=1 for Gather of past") + return None + past = gather.input[0] + + parent = self.model.get_parent(concat_k, 0, output_name_to_node) + if parent.op_type == "Gather": + gather_past_k = parent + else: + past_k_nodes = self.model.match_parent_path(concat_k, ["Transpose", "Gather"], [0, 0]) + if past_k_nodes is None: + logger.debug("match_past_pattern_1: failed match Transpose and Gather") + return None + gather_past_k = past_k_nodes[-1] + + if not self.model.find_constant_input(gather_past_k, 0) == 1: + logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past") + return None + past_k = gather_past_k.input[0] + if past != past_k: + logger.debug("match_past_pattern_1: expect past to be same") + return None + + return 
past + + def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node): + # Pattern 2: + # Split (QKV) + # / | | + # / | +----------------------+ + # | | + # | {past} | + # | | | + # Reshape Split Reshape + # | / \ | + # Transpose_k Squeeze Squeeze Transpose_v + # | | \ / + # +------|---+ \ / + # | | \ / + # Concat_k Concat_v + # | | + # Unsqueeze Unsqueeze + # \ / + # Concat + # | + # {present} + # + squeeze = self.model.get_parent(concat_v, 0, output_name_to_node) + if squeeze.op_type != "Squeeze": + logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v") + return None + + split = self.model.get_parent(squeeze, 0, output_name_to_node) + if split.op_type != "Split": + logger.debug("match_past_pattern_2: expect Split for past path") + return None + + opset_version = self.model.get_opset_version() + if opset_version < 13: + if not FusionUtils.check_node_attribute(squeeze, "axes", [0]): + logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path") + return None + + if not FusionUtils.check_node_attribute(split, "split", [1, 1]): + logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path") + return None + else: + if not self.utils.check_node_input_value(squeeze, 1, [0]): + logger.debug("match_past_pattern_2: axes != [0] for Squeeze in past path") + return None + + if not self.utils.check_node_input_value(split, 1, [1, 1]): + logger.debug("match_past_pattern_2: split != [1, 1] for Split in past path") + return None + + if not FusionUtils.check_node_attribute(split, "axis", 0, default_value=0): + logger.debug("match_past_pattern_2: attribute axis of Split are not expected in past path") + return None + past = split.input[0] + + past_k_nodes = self.model.match_parent_path(concat_k, ["Squeeze", "Split"], [0, 0]) + if past_k_nodes is None: + logger.debug("match_past_pattern_2: failed to match past_k_nodes path") + return None + past_k = past_k_nodes[-1].input[0] + + if past != past_k: + logger.info("match_past_pattern_2: expect past to be same") + return None + + return past + + def match_present(self, concat_v, input_name_to_nodes): + unsqueeze_present_v = self.model.find_first_child_by_type( + concat_v, "Unsqueeze", input_name_to_nodes, recursive=False + ) + if not unsqueeze_present_v: + logger.info("expect unsqueeze for present") + return None + concat_present = self.model.find_first_child_by_type( + unsqueeze_present_v, "Concat", input_name_to_nodes, recursive=False + ) + if not concat_present: + logger.info("expect concat for present") + return None + + present = concat_present.output[0] + return present + + def cast_attention_mask(self, input_name): + if input_name in self.casted_attention_mask: + attention_mask_input_name = self.casted_attention_mask[input_name] + elif self.model.find_graph_input(input_name): + casted, attention_mask_input_name = self.utils.cast_graph_input_to_int32(input_name) + self.casted_attention_mask[input_name] = attention_mask_input_name + else: + attention_mask_input_name, cast_node = self.utils.cast_input_to_int32(input_name) + self.casted_attention_mask[input_name] = attention_mask_input_name + return attention_mask_input_name + + +class FusionGptAttention(FusionGptAttentionPastBase): + """ + Fuse GPT-2 Attention with past state subgraph into one Attention node. 
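+    The fused node is a com.microsoft Attention op whose inputs are the merged QKV weight and bias, the attention mask and the past state; the output projection is re-emitted as separate MatMul and Add nodes.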
+ """ + + def __init__(self, model: OnnxModel, num_heads: int): + super().__init__(model, num_heads) + + def create_attention_node( + self, + fc_weight, + fc_bias, + gemm_qkv, + past, + present, + input, + output, + mask, + is_unidirectional, + ): + attention_node_name = self.model.create_node_name("GptAttention") + attention_node = helper.make_node( + "Attention", + inputs=[input, fc_weight, fc_bias, mask, past], + outputs=[attention_node_name + "_output", present], + name=attention_node_name, + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [ + helper.make_attribute("num_heads", self.num_heads), + helper.make_attribute("unidirectional", 1 if is_unidirectional else 0), + ] + ) + + matmul_node = helper.make_node( + "MatMul", + inputs=[attention_node_name + "_output", gemm_qkv.input[1]], + outputs=[attention_node_name + "_matmul_output"], + name=attention_node_name + "_matmul", + ) + + add_node = helper.make_node( + "Add", + inputs=[attention_node_name + "_matmul_output", gemm_qkv.input[2]], + outputs=[output], + name=attention_node_name + "_add", + ) + self.nodes_to_add.extend([attention_node, matmul_node, add_node]) + self.node_name_to_graph_name[attention_node.name] = self.this_graph_name + self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name + self.node_name_to_graph_name[add_node.name] = self.this_graph_name + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + past = None + present = None + return_indice = [] + qkv_nodes = self.model.match_parent_path( + normalize_node, + ["Add", "Reshape", "Gemm", "Reshape", "Reshape", "Transpose", "MatMul"], + [0, None, 0, 0, 0, 0, 0], + output_name_to_node=output_name_to_node, + return_indice=return_indice, + ) # yapf: disable + if qkv_nodes is None: + return + ( + add_qkv, + reshape_qkv, + gemm_qkv, + reshape_1, + reshape_2, + transpose_qkv, + matmul_qkv, + ) = qkv_nodes + + another_input = add_qkv.input[1 - return_indice[0]] + + v_nodes = self.model.match_parent_path(matmul_qkv, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0]) + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + (concat_v, transpose_v, reshape_v, split_fc) = v_nodes + + fc_nodes = self.model.match_parent_path( + split_fc, + ["Reshape", "Gemm", "Reshape", "LayerNormalization"], + [0, 0, 0, 0], + output_name_to_node, + ) + if fc_nodes is None: + fc_nodes = self.model.match_parent_path( + split_fc, + ["Add", "MatMul", "LayerNormalization"], + [0, None, 0], + output_name_to_node, + ) + if fc_nodes is None: + logger.debug("fuse_attention: failed to match fc path") + return + fc_weight = fc_nodes[1].input[1] + i, _ = self.model.get_constant_input(fc_nodes[0]) + fc_bias = fc_nodes[0].input[i] + else: + fc_weight = fc_nodes[1].input[1] + fc_bias = fc_nodes[1].input[2] + + layernorm_before_attention = fc_nodes[-1] + + if not another_input in layernorm_before_attention.input: + logger.debug("Add and LayerNormalization shall have one same input") + return + + is_unidirectional = True + slice_mask = None + input_mask_nodes = None + concat_k_to_match = None + qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "Div", "MatMul"], [0, 0, 0, 0, 0]) + if qk_nodes is not None: + (softmax_qk, sub_qk, mul_qk, div_qk, matmul_qk) = qk_nodes + mask_nodes = self.model.match_parent_path( + sub_qk, + [ + "Mul", + "Sub", + "Slice", + "Slice", + "Unsqueeze", + "Sub", + "Squeeze", + "Slice", + "Shape", + "Div", + ], + [1, 0, 1, 0, 1, 0, 0, 0, 0, 0], + ) # yapf: 
disable + if mask_nodes is None: + logger.debug("fuse_attention: failed to match unidirectional mask path") + return + div_mask = mask_nodes[-1] + slice_mask = mask_nodes[3] + + if div_qk != div_mask: + logger.debug("fuse_attention: skip since div_qk != div_mask") + return + else: + # New pattern for gpt2 from PyTorch 1.5.0 and Transformers 2.9.0. + i, qk_nodes, _ = self.model.match_parent_paths( + matmul_qkv, + [ + (["Softmax", "Where", "Div", "MatMul"], [0, 0, 1, 0]), + (["Softmax", "Add", "Where", "Div", "MatMul"], [0, 0, None, 1, 0]), + ], + output_name_to_node, + ) + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk nodes") + return + + where_qk = qk_nodes[-3] + div_qk = qk_nodes[-2] + matmul_qk = qk_nodes[-1] + + if i == 1: + add_qk = qk_nodes[1] + _, input_mask_nodes, _ = self.model.match_parent_paths( + add_qk, + [ + ( + ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze", "Reshape"], + [None, 0, 1, 0, 0, 0], + ), + ( + ["Mul", "Sub", "Unsqueeze", "Unsqueeze", "Reshape"], + [None, 0, 1, 0, 0], + ), + ( + ["Mul", "Sub", "Unsqueeze", "Unsqueeze"], + [None, 0, 1, 0], + ), # useless cast and reshape are removed. + ], + output_name_to_node, + ) # yapf: disable + if input_mask_nodes is None: + logger.debug("fuse_attention: failed to match input attention mask path") + return + + mask_nodes = self.model.match_parent_path( + where_qk, + [ + "Cast", + "Slice", + "Slice", + "Unsqueeze", + "Sub", + "Squeeze", + "Slice", + "Shape", + ], + [0, 0, 0, 1, 0, 0, 0, 0], + output_name_to_node, + ) # yapf: disable + if mask_nodes is None: + # TODO: match mask path for GPT2LMHeadModel_BeamSearchStep. + logger.debug("fuse_attention: failed to match mask path") + return + + slice_mask = mask_nodes[2] + + div_or_concat = self.model.get_parent(mask_nodes[-1], 0, output_name_to_node) + if div_or_concat.op_type == "Div": + div_mask = div_or_concat + if div_qk != div_mask: + logger.debug("fuse_attention: skip since div_qk != div_mask") + return + elif div_or_concat.op_type == "Concat": + concat_k_to_match = div_or_concat + else: + logger.debug("fuse_attention: failed to match mask path") + + # Validate that the mask data is either lower triangular (unidirectional) or all ones + mask_data = numpy_helper.to_array(self.model.get_initializer(slice_mask.input[0])) + if not ( + len(mask_data.shape) == 4 and mask_data.shape[:2] == (1, 1) and mask_data.shape[2] == mask_data.shape[3] + ): + logger.debug("fuse_attention: skip since mask shape is not 1x1xWxW") + return + if np.allclose(mask_data, np.ones_like(mask_data)): + is_unidirectional = False + elif not np.allclose(mask_data, np.tril(np.ones_like(mask_data))): + logger.debug("fuse_attention: skip since mask is neither lower triangular nor ones") + return + + q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0]) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + (transpose_q, reshape_q, split_q) = q_nodes + if split_fc != split_q: + logger.debug("fuse_attention: skip since split_fc != split_q") + return + + k_nodes = self.model.match_parent_path(matmul_qk, ["Concat", "Transpose", "Reshape", "Split"], [1, 1, 0, 0]) + if k_nodes is None: + # This pattern is from pytorch 1.7.1 and transformers 4.6.1 + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Concat", "Transpose", "Reshape", "Split"], + [1, 0, 1, 0, 0], + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + else: + (_, concat_k, transpose_k, reshape_k, 
split_k) = k_nodes + else: + (concat_k, transpose_k, reshape_k, split_k) = k_nodes + if split_fc != split_k: + logger.debug("fuse_attention: skip since split_fc != split_k") + return + + if concat_k_to_match and concat_k != concat_k_to_match: + logger.debug("fuse_attention: skip since concat_k != concat_k_to_match") + return + + attention_mask_input_name = "" + if input_mask_nodes is not None: + input_name = input_mask_nodes[-1].input[0] + attention_mask_input_name = self.cast_attention_mask(input_name) + + # Match past and present paths + past = self.match_past_pattern_1(concat_k, concat_v, output_name_to_node) or self.match_past_pattern_2( + concat_k, concat_v, output_name_to_node + ) + if past is None: + logger.info("fuse_attention: failed to match past path") + return + if not self.model.find_graph_input(past): + logger.debug("past is not graph input.") + # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. + + present = self.match_present(concat_v, input_name_to_nodes) + if present is None: + logger.info("fuse_attention: failed to match present path") + return + if not self.model.find_graph_output(present): + logger.info("expect present to be graph output") + return + + self.create_attention_node( + fc_weight, + fc_bias, + gemm_qkv, + past, + present, + layernorm_before_attention.output[0], + reshape_qkv.output[0], + attention_mask_input_name, + is_unidirectional, + ) + + # we rely on prune_graph() to clean old subgraph nodes: + # qk_nodes + q_nodes + k_nodes + v_nodes + mask_nodes + [reshape_qkv, transpose_qkv, matmul_qkv] + self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py new file mode 100644 index 0000000000000000000000000000000000000000..8510ae42937b77d7c7d26941d1b0be9abe8b9679 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_megatron.py @@ -0,0 +1,292 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger + +import numpy as np +from .fusion_base import Fusion +from .fusion_gpt_attention import FusionGptAttentionPastBase +from .fusion_utils import FusionUtils +from onnx import TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +def is_close(value, expected_value): + return abs(value - expected_value) <= 1e-6 + + +class FusionGptAttentionMegatron(FusionGptAttentionPastBase): + """ + Fuse GPT-2 Attention with past state subgraph from Megatron into one Attention node. 
+ """ + + def __init__(self, model: OnnxModel, num_heads: int): + super().__init__(model, num_heads) + + def fuse_attention_node( + self, + matmul_before_split, + add_before_split, + past, + present, + input, + reshape_qkv, + mask, + ): + attention_node_name = self.model.create_node_name("GptAttention") + int32_mask = self.cast_attention_mask(mask) + output = reshape_qkv.output[0] + i = 1 if (add_before_split.input[0] == matmul_before_split.output[0]) else 0 + attention_node = helper.make_node( + "Attention", + inputs=[ + input, + matmul_before_split.input[1], + add_before_split.input[i], + int32_mask, + past, + ], + outputs=[output, present], + name=attention_node_name, + ) + attention_node.domain = "com.microsoft" + attention_node.attribute.extend( + [ + helper.make_attribute("num_heads", self.num_heads), + helper.make_attribute("unidirectional", 0), # unidirectional shall not be ON for 4D attention mask + ] + ) + + nodes_to_add = [attention_node] + self.nodes_to_add.extend(nodes_to_add) + + for node in nodes_to_add: + self.node_name_to_graph_name[node.name] = self.this_graph_name + + self.nodes_to_remove.append(reshape_qkv) + + # we rely on prune_graph() to clean old subgraph nodes + self.prune_graph = True + + def match_mask(self, sub_qk, mul_qk, matmul_qk, layernorm_before_attention): + mask_nodes = self.model.match_parent_path( + sub_qk, ["Mul", "Sub", "Slice", "Slice"], [1, 0, 1, 0] + ) # yapf: disable + if mask_nodes is None: + logger.debug("fuse_attention: failed to match unidirectional mask path") + return None + (mul_mask, sub_mask, last_slice_mask, slice_mask) = mask_nodes + + if mul_qk.input[1] != last_slice_mask.output[0]: + logger.debug("fuse_attention failed: mul_qk.input[1] != last_slice_mask.output[0]") + return None + + if not self.utils.check_node_input_value(mul_mask, 1, 10000.0): + logger.debug("fuse_attention failed: mul_mask input 1 is not constant 10000.0") + return None + + if not self.utils.check_node_input_value(sub_mask, 0, 1.0): + logger.debug("fuse_attention failed: sub_mask input 0 is not constant 1.0") + return None + + if not self.model.find_graph_input(slice_mask.input[0]): + logger.info("expect slick_mask input 0 to be graph input") + return None + + if not self.utils.check_node_input_value(last_slice_mask, 1, [0]): + logger.debug("fuse_attention failed: last_slice_mask input 1 (starts) is not constant [0]") + return None + + if not self.utils.check_node_input_value(last_slice_mask, 3, [3]): + logger.debug("fuse_attention failed: last_slice_mask input 3 (axes) is not constant [3]") + return False + + if not self.utils.check_node_input_value(last_slice_mask, 4, [1]): + logger.debug("fuse_attention failed: last_slice_mask input 4 (steps) is not constant [1]") + return False + + if not self.utils.check_node_input_value(slice_mask, 3, [2]): + logger.debug("fuse_attention failed: slice_mask input 3 (axes) is not constant [2]") + return None + + if not self.utils.check_node_input_value(slice_mask, 4, [1]): + logger.debug("fuse_attention failed: slice_mask input 4 (steps) is not constant [1]") + return None + + last_slice_path = self.model.match_parent_path( + last_slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] + ) + if last_slice_path is None or last_slice_path[-1] != matmul_qk: + logger.debug("fuse_attention: failed to match last slice path") + return None + + first_slice_path = self.model.match_parent_path( + slice_mask, ["Unsqueeze", "Gather", "Shape", "MatMul"], [2, 0, 0, 0] + ) + if first_slice_path is None or first_slice_path[-1] != 
matmul_qk: + logger.debug("fuse_attention: failed to match first slice path") + return None + + first_slice_sub = self.model.match_parent_path( + slice_mask, + ["Unsqueeze", "Sub", "Gather", "Shape", "MatMul"], + [1, 0, 0, 0, 0], + ) + if first_slice_sub is None or first_slice_sub[-1] != matmul_qk: + logger.debug("fuse_attention: failed to match last slice sub path") + return None + + first_slice_sub_1 = self.model.match_parent_path( + slice_mask, + ["Unsqueeze", "Sub", "Gather", "Shape", "LayerNormalization"], + [1, 0, 1, 0, 0], + ) + if first_slice_sub_1 is None or first_slice_sub_1[-1] != layernorm_before_attention: + logger.debug("fuse_attention: failed to match last slice sub path 1") + return None + + return slice_mask.input[0] + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + past = None + present = None + + qkv_nodes = self.model.match_parent_path( + normalize_node, + ["Add", "Add", "MatMul", "Reshape", "Transpose", "MatMul"], + [0, 1, None, 0, 0, 0], + output_name_to_node=output_name_to_node, + ) # yapf: disable + if qkv_nodes is None: + return + ( + add_skip, + add_after_attention, + matmul_after_attention, + reshape_qkv, + transpose_qkv, + matmul_qkv, + ) = qkv_nodes + + skip_input = add_skip.input[0] + + v_nodes = self.model.match_parent_path( + matmul_qkv, + [ + "Concat", + "Transpose", + "Reshape", + "Split", + "Add", + "MatMul", + "LayerNormalization", + ], + [1, 1, 0, 0, 0, None, 0], + ) # yapf: disable + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + ( + concat_v, + transpose_v, + reshape_v, + split_v, + add_before_split, + matmul_before_split, + layernorm_before_attention, + ) = v_nodes + if skip_input != layernorm_before_attention.input[0]: + logger.debug("fuse_attention: skip_input != layernorm_before_attention.input[0]") + return + + qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Sub", "Mul", "MatMul"], [0, 0, 0, 0]) + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return None + (softmax_qk, sub_qk, mul_qk, matmul_qk) = qk_nodes + if self.model.get_node_attribute(softmax_qk, "axis") != 3: + logger.debug("fuse_attention failed: softmax_qk axis != 3") + return None + + attention_mask = self.match_mask(sub_qk, mul_qk, matmul_qk, layernorm_before_attention) + + q_nodes = self.model.match_parent_path(matmul_qk, ["Div", "Transpose", "Reshape", "Split"], [0, 0, 0, 0]) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + (div_q, transpose_q, reshape_q, split_q) = q_nodes + if split_v != split_q: + logger.debug("fuse_attention: skip since split_v != split_q") + return + + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Div", "Transpose", "Concat", "Transpose", "Reshape", "Split"], + [1, 0, 0, 1, 0, 0], + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + (div_k, _, concat_k, transpose_k, reshape_k, split_k) = k_nodes + if split_v != split_k: + logger.debug("fuse_attention: skip since split_v != split_k") + return + + i, value = self.model.get_constant_input(reshape_k) + if not ( + isinstance(value, np.ndarray) + and list(value.shape) == [4] + and value[0] == 0 + and value[1] == 0 + and value[2] > 0 + and value[3] > 0 + ): + logger.debug("fuse_attention: reshape constant input is not [0, 0, N, H]") + return + + num_heads = value[2] + if num_heads != self.num_heads: + logger.info(f"Detected num_heads={num_heads}. 
Ignore user specified value {self.num_heads}") + self.num_heads = num_heads + + hidden_size_per_head = value[3] + i, value = self.model.get_constant_input(div_k) + expected_value = float(np.sqrt(np.sqrt(hidden_size_per_head))) + if not is_close(value, expected_value): + logger.debug(f"fuse_attention: div_k value={value} expected={expected_value}") + return + + i, value = self.model.get_constant_input(div_q) + if not is_close(value, expected_value): + logger.debug(f"fuse_attention: div_q value={value} expected={expected_value}") + return + + # Match past and present paths + past = self.match_past_pattern_2(concat_k, concat_v, output_name_to_node) + if past is None: + logger.debug("fuse_attention: match past failed") + return + if not self.model.find_graph_input(past): + logger.debug("fuse_attention: past is not graph input.") + # For GPT2LMHeadModel_BeamSearchStep, there is an extra Gather node to select beam index so it is not graph input. + + present = self.match_present(concat_v, input_name_to_nodes) + if present is None: + logger.debug("fuse_attention: match present failed") + return + if not self.model.find_graph_output(present): + logger.info("fuse_attention: expect present to be graph output") + return + + self.fuse_attention_node( + matmul_before_split, + add_before_split, + past, + present, + layernorm_before_attention.output[0], + reshape_qkv, + attention_mask, + ) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py new file mode 100644 index 0000000000000000000000000000000000000000..ca88f144fb2fc0095c03a79bc040e8a369255603 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_gpt_attention_no_past.py @@ -0,0 +1,252 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import math +from logging import getLogger +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionGptAttentionNoPast(Fusion): + """ + Fuse GPT-2 Attention without past state into one Attention node. + This does not support attention_mask graph input right now. 
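+    The matched subgraph is replaced by a single CustomQKVToContextPluginDynamic_IxRT plugin node (domain com.iluvatar).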
+    """
+
+    def __init__(self, model: OnnxModel):
+        super().__init__(
+            model,
+            "CustomQKVToContextPluginDynamic_IxRT",
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
+            "without past",
+        )
+        self.where_qk_shared = None
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, div: NodeProto
+    ) -> Tuple[int, int]:
+        div_initializer = self.model.get_initializer(div.input[1])
+
+        # Check whether float_data is empty
+        if len(div_initializer.float_data) > 0:
+            div_value = div_initializer.float_data[0]
+        else:
+            # If float_data is empty, try other ways to obtain the data,
+            # e.g. when the data is stored in raw_data
+            if len(div_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type]
+                div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the div_initializer")
+
+        for attr in custom_fc.attribute:
+            if attr.name == "W":
+                tensor_value = attr.t
+                tensor_shape = [dim for dim in tensor_value.dims]
+                break
+        head_dim = math.ceil(div_value * div_value)
+        hidden_size = tensor_shape[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        where_qk: NodeProto,
+    ) -> Union[NodeProto, None]:
+
+        attention_node_name = self.model.create_node_name("Attention")
+
+        attention_inputs = [input]
+        if where_qk is not None:
+            has_mask = 1
+            has_qk_bias = 1
+            attention_inputs.append(where_qk.output[0])
+
+        attention_node = helper.make_node(
+            "CustomQKVToContextPluginDynamic_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
+        attention_node.attribute.extend(
+            [helper.make_attribute("hidden_size", hidden_size)]
+        )
+        attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+        attention_node.attribute.extend(
+            [helper.make_attribute("has_qk_bias", has_qk_bias)]
+        )
+        return attention_node
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        return_indice = []
+        add_qkv = normalize_node
+        if normalize_node.op_type == "LayerNormalization":
+            add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
+            if add_before_layernorm is not None:
+                add_qkv = add_before_layernorm
+
+        qkv_paths = {
+            "path1": (
+                ["CustomFCPluginDynamic_IxRT", "Reshape", "Transpose", "MatMul"],
+                [None, 0, 0, 0],
+            ),
+            "path2": (
+                ["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"],
+                [None, 0, 0],
+            ),
+        }
+
+        qkv_nodes, qkv_path = self.match_parent_path_from_dict(
+            add_qkv,
+            qkv_paths,
+            output_name_to_node,
+            return_indice,
+        )  # yapf: disable
+
+        if qkv_nodes is None:
+            return
+        reshape_2 = None
+        if qkv_path == "path1":
+            (
+                custom_fc_after_attention,
+                reshape_2,
+                transpose_qkv,
+                matmul_qkv,
+            ) = qkv_nodes
+        else:
+            (
+                custom_fc_after_attention,
+                transpose_qkv,
+                matmul_qkv,
+            ) = qkv_nodes
+
+        another_input = add_qkv.input[1 - return_indice[0]]
+
+        v_nodes = self.model.match_parent_path(
+            matmul_qkv,
+            ["Transpose", "Reshape", "Split", "CustomFCPluginDynamic_IxRT"],
+            [1, 0, 0, 0],
+        )  # yapf: disable
+        if v_nodes is None:
+            logger.debug("fuse_attention: failed to match v path")
+            return
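+        # v_nodes was matched bottom-up from matmul_qkv: Transpose <- Reshape <- Split <- CustomFC (the shared QKV projection)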
+ ( + transpose_v, + reshape_v, + split_v, + custom_fc_before_attention, + ) = v_nodes + + layernorm_before_attention = self.model.get_parent( + custom_fc_before_attention, 0, output_name_to_node + ) + if ( + layernorm_before_attention is None + or layernorm_before_attention.op_type != "LayerNormalization" + ): + if layernorm_before_attention.op_type != "Add": + logger.debug( + f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}" + ) + return + + if not another_input in layernorm_before_attention.input: + # match openai-gpt + if not another_input in layernorm_before_attention.output: + logger.debug("Add and LayerNormalization shall have one same input") + return + + qk_nodes = self.model.match_parent_path( + matmul_qkv, ["Softmax", "Add", "Where", "Div", "MatMul"], [0, None, 0, 1, 0] + ) + where_qk = None + matmul_qk = None + mask_return_indices = [] + if qk_nodes is not None: + (softmax_qk, add_qk, where_qk, div_qk, matmul_qk) = qk_nodes + mask_nodes = self.model.match_parent_path( + add_qk, + ["Mul", "Sub", "Cast", "Unsqueeze"], + [None, 0, 1, 0], + return_indice=mask_return_indices, + ) # yapf: disable + if mask_nodes is None: + logger.debug("fuse_attention: failed to match mask path") + return + + q_nodes = self.model.match_parent_path( + matmul_qk, ["Transpose", "Reshape", "Split"], [0, 0, 0] + ) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + (transpose_q, reshape_q, split_q) = q_nodes + if split_v != split_q: + logger.debug("fuse_attention: skip since split_v != split_q") + return + + k_nodes = self.model.match_parent_path( + matmul_qk, ["Transpose", "Reshape", "Split"], [1, 0, 0] + ) + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + (transpose_k, reshape_k, split_k) = k_nodes + if split_v != split_k: + logger.debug("fuse_attention: skip since split_v != split_k") + return + + if where_qk is None: + return + + if self.where_qk_shared is None: + where_qk.input[1] = mask_nodes[0].output[0] + div_qk.output[0] = where_qk.output[0] + add_qk.input[1 - mask_return_indices[0]] = div_qk.output[0] + self.where_qk_shared = where_qk + self.nodes_to_remove.extend([softmax_qk, add_qk, div_qk, matmul_qk]) + else: + self.nodes_to_remove.extend( + [softmax_qk, add_qk, where_qk, div_qk, matmul_qk] + ) + + num_heads, hidden_size = self.get_num_heads_and_hidden_size( + custom_fc_after_attention, div_qk + ) + new_node = self.create_attention_node( + num_heads, + hidden_size, + custom_fc_before_attention.output[0], + transpose_qkv.output[0] if reshape_2 is None else reshape_2.output[0], + self.where_qk_shared, + ) + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + if reshape_2 is not None: + self.nodes_to_remove.extend([reshape_2]) + self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes[:-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..727a1aa50848f7008ebb752a1aebc765efbc0e61 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_layernorm.py @@ -0,0 +1,495 @@ +# 
------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict + +import numpy as np +from onnx import TensorProto, helper + +from .fusion_base import Fusion +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionLayerNormalization(Fusion): + def __init__(self, model: OnnxModel, hidden_size): + self.hidden_size = hidden_size + super().__init__(model, "LayerNormalization", "ReduceMean") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + Fuse Layer Normalization subgraph into one node LayerNormalization: + +----------------------+ + | | + | v + [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add + (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ + | | + +-----------------------------------------------+ + + It also handles cases of duplicated sub nodes exported from older version of PyTorch: + +----------------------+ + | v + | +-------> Sub-----------------------------------------------+ + | | | + | | v + [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add + | ^ + | | + +----------------------+ + """ + children = self.model.get_children(node, input_name_to_nodes) + if len(children) == 0 or len(children) > 2: + return + + root_input = node.input[0] + + if children[0].op_type != "Sub" or children[0].input[0] != root_input: + return + + if len(children) == 2: + if children[1].op_type != "Sub" or children[1].input[0] != root_input: + return + + div_node = None + for child in children: + div_node = self.model.find_first_child_by_type( + child, "Div", input_name_to_nodes, recursive=False + ) + if div_node is not None: + break + if div_node is None: + return + + path_id, parent_nodes, _ = self.model.match_parent_paths( + div_node, + [ + (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]), + ( + ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"], + [1, 0, 0, 0, 0, 0], + ), + ], + output_name_to_node, + ) + if path_id < 0: + return + + sub_node = parent_nodes[-1] + if sub_node not in children: + return + + second_add_node = parent_nodes[1] + i, add_weight = self.model.get_constant_input(second_add_node) + if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: + logger.warning(f"epsilon value is not expeced: {add_weight}") + return + + pow_node = parent_nodes[3] + if not self.model.find_constant_input(pow_node, 2.0) == 1: + return + + mul_node = input_name_to_nodes[div_node.output[0]][0] + is_not_have_mul_and_add = False + is_not_have_mul_and_add_lst_node = None + # deal with special case : layernorm do not have mul and add + if mul_node.op_type != "Mul" and mul_node.op_type == "MatMul": + is_not_have_mul_and_add = True + is_not_have_mul_and_add_lst_node = div_node + elif mul_node.op_type != "Mul": + return + + if is_not_have_mul_and_add: + last_add_node = is_not_have_mul_and_add_lst_node + if self.hidden_size == 0: + print( + "[Error] Please add '--hidden_size' and '--num_head' to fuse layernorm ..." 
+ ) + exit(0) + + subgraph_nodes = [node] + subgraph_nodes.extend(children) + subgraph_nodes.extend(parent_nodes[:-1]) + subgraph_nodes.extend([last_add_node]) + if len(subgraph_nodes) == 7: + self.nodes_to_remove.extend(subgraph_nodes) + else: + return + + norm_name = self.model.create_node_name( + "LayerNormalization", name_prefix="LayerNorm" + ) + np_weights = np.ones((self.hidden_size)).astype(np.float32) + np_weights_name = norm_name + "_weights" + weights_tensor = helper.make_tensor( + np_weights_name, TensorProto.FLOAT, np_weights.shape, np_weights + ) + np_bias = np.zeros((self.hidden_size)).astype(np.float32) + np_bias_name = norm_name + "_bias" + bias_tensor = helper.make_tensor( + np_bias_name, TensorProto.FLOAT, np_bias.shape, np_bias + ) + self.model.add_initializer(weights_tensor) + self.model.add_initializer(bias_tensor) + normalize_node = helper.make_node( + "LayerNormalization", + inputs=[node.input[0], np_weights_name, np_bias_name], + outputs=[last_add_node.output[0]], + name=norm_name, + ) + normalize_node.attribute.extend( + [helper.make_attribute("epsilon", float(add_weight))] + ) + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + else: + last_add_node = input_name_to_nodes[mul_node.output[0]][0] + if last_add_node.op_type != "Add": + return + + subgraph_nodes = [node] + subgraph_nodes.extend(children) + subgraph_nodes.extend(parent_nodes[:-1]) + + subgraph_nodes.extend([last_add_node, mul_node, div_node]) + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + last_add_node.output, + input_name_to_nodes, + output_name_to_node, + ): + logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") + return + + weight_input = mul_node.input[ + 1 - self.model.input_index(div_node.output[0], mul_node) + ] + if not self.model.is_constant_with_specified_dimension( + weight_input, 1, "layernorm weight" + ): + return + + bias_input = last_add_node.input[ + 1 - self.model.input_index(mul_node.output[0], last_add_node) + ] + if not self.model.is_constant_with_specified_dimension( + bias_input, 1, "layernorm bias" + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + normalize_node = helper.make_node( + "LayerNormalization", + inputs=[node.input[0], weight_input, bias_input], + outputs=[last_add_node.output[0]], + name=self.model.create_node_name( + "LayerNormalization", name_prefix="LayerNorm" + ), + ) + normalize_node.attribute.extend( + [helper.make_attribute("epsilon", float(add_weight))] + ) + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + + +class FusionLayerNormalizationKeras(Fusion): + def __init__(self, model: OnnxModel): + super().__init__( + model, "LayerNormalization", "GlobalAveragePool", "Keras layernorm" + ) + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + +-------------------------------+ + | | + | v + [Root] --> GlobalAveragePool--> Sub --> Mul --> GlobalAveragePool --> Add/Min/Max --> Sqrt --> Div --> Mul --> Add + | ^ + | | + +---------------------------------------------------------------+ + """ + children = self.model.get_children(node, input_name_to_nodes) + # print(len(children)) + if len(children) != 1: + return + + root_input = node.input[0] + + if children[0].op_type != "Sub" or children[0].input[0] != root_input: + return + + div_node = None + for child in children: + div_node = self.model.find_first_child_by_type( + child, "Div", input_name_to_nodes, 
recursive=False + ) + if div_node is not None: + break + if div_node is None: + return + # print('div_node_name:', div_node.name) + path_id, parent_nodes, _ = self.model.match_parent_paths( + div_node, + [ + ( + ["Sqrt", "Max", "Min", "Add", "GlobalAveragePool", "Mul", "Sub"], + [1, 0, 0, 0, None, 0, None], + ), + ], + output_name_to_node, + ) + if path_id < 0: + return + + sub_node = parent_nodes[-1] + if sub_node not in children: + return + + second_add_node = parent_nodes[3] + i, add_weight = self.model.get_constant_input(second_add_node) + if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: + logger.warning(f"epsilon value is not expeced: {add_weight}") + return + + mul_node = input_name_to_nodes[div_node.output[0]][0] + if mul_node.op_type != "Mul": + return + + last_add_node = input_name_to_nodes[mul_node.output[0]][0] + if last_add_node.op_type != "Add": + return + + subgraph_nodes = [node] + subgraph_nodes.extend(children) + subgraph_nodes.extend(parent_nodes[:-1]) + + subgraph_nodes.extend([last_add_node, mul_node, div_node]) + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + last_add_node.output, + input_name_to_nodes, + output_name_to_node, + ): + logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") + return + + weight_input = mul_node.input[ + 1 - self.model.input_index(div_node.output[0], mul_node) + ] + if not self.model.is_constant_with_specified_dimension( + weight_input, 1, "layernorm weight" + ): + return + + bias_input = last_add_node.input[ + 1 - self.model.input_index(mul_node.output[0], last_add_node) + ] + if not self.model.is_constant_with_specified_dimension( + bias_input, 1, "layernorm bias" + ): + return + + self.nodes_to_remove.extend(subgraph_nodes) + normalize_node = helper.make_node( + "LayerNormalization", + inputs=[node.input[0], weight_input, bias_input], + outputs=[last_add_node.output[0]], + name=self.model.create_node_name( + "LayerNormalization", name_prefix="LayerNorm" + ), + ) + normalize_node.attribute.extend( + [helper.make_attribute("epsilon", float(add_weight))] + ) + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + + +class FusionLayerNormalizationTF(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "LayerNormalization", "Add", "TF") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + Layer Norm from Tensorflow model(using keras2onnx or tf2onnx): + +------------------------------------+ + | | + | | + (Cast_1) | + | | + | v (B) (B) (A) + Add --> (Cast_1) --> ReduceMean --> Sub --> Mul --> ReduceMean --> (Cast_3) --> Add --> Sqrt --> Reciprocol --> Mul --> Mul --> Sub --> Add + | | | ^ ^ + | | | | | + | +--------------------------------------------------(Cast_2)-------------------------------|-------+ | + | v | + +---------------------------------------------------------------------------------------------------------------> Mul--------------------+ + """ + return_indice = [] + _, parent_nodes, return_indice = self.model.match_parent_paths( + node, + [ + ( + [ + "Sub", + "Mul", + "Mul", + "Reciprocal", + "Sqrt", + "Add", + "ReduceMean", + "Mul", + "Sub", + "ReduceMean", + ], + [1, 1, None, 0, 0, 0, None, 0, 0, None], + ), + ( + [ + "Sub", + "Mul", + "Mul", + "Reciprocal", + "Sqrt", + "Add", + "Cast", + "ReduceMean", + "Mul", + "Sub", + "ReduceMean", + ], + [1, 1, None, 0, 0, 0, 0, None, 0, 0, None], + ), + ], + output_name_to_node, + ) # yapf: disable + + if parent_nodes is 
None: + return + + assert len(return_indice) == 3 + if not ( + return_indice[0] in [0, 1] + and return_indice[1] in [0, 1] + and return_indice[2] in [0, 1] + ): + logger.debug( + "return indice is exepected in [0, 1], but got {return_indice}" + ) + return + + ( + sub_node_0, + mul_node_0, + mul_node_1, + reciprocol_node, + sqrt_node, + add_node_0, + ) = parent_nodes[:6] + reduce_mean_node_0, mul_node_2, sub_node_1, reduce_mean_node_1 = parent_nodes[ + -4: + ] + + cast_node_3 = None + if len(parent_nodes) == 11: + cast_node_3 = parent_nodes[6] + assert cast_node_3.op_type == "Cast" + + mul_node_3 = self.model.match_parent(node, "Mul", 0, output_name_to_node) + if mul_node_3 is None: + logger.debug("mul_node_3 not found") + return + + node_before_reduce = self.model.get_parent( + reduce_mean_node_1, 0, output_name_to_node + ) + root_node = ( + node_before_reduce + if cast_node_3 is None + else self.model.get_parent(node_before_reduce, 0, output_name_to_node) + ) + if root_node is None: + logger.debug("root node is none") + return + + i, epsilon = self.model.get_constant_input(add_node_0) + if ( + epsilon is None + or epsilon <= 0 + or (epsilon > 1.0e-5 and cast_node_3 is None) + ): + logger.debug("epsilon is not matched") + return + + if cast_node_3 is None and ( + reduce_mean_node_1.input[0] not in mul_node_3.input + or reduce_mean_node_1.input[0] not in sub_node_1.input + ): + logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") + return + + if cast_node_3 is not None and ( + node_before_reduce.input[0] not in mul_node_3.input + or reduce_mean_node_1.input[0] not in sub_node_1.input + ): + logger.debug("reduce_mean_node_1 and mul_node_3 shall link from root node") + return + + if mul_node_2.input[0] != mul_node_2.input[1]: + logger.debug("mul_node_2 shall have two same inputs") + return + + subgraph_nodes = [ + node, + sub_node_0, + mul_node_0, + mul_node_1, + reciprocol_node, + sqrt_node, + add_node_0, + reduce_mean_node_0, + mul_node_2, + sub_node_1, + reduce_mean_node_1, + mul_node_3, + ] + + if cast_node_3 is not None: + cast_node_2 = self.model.match_parent( + mul_node_0, "Cast", 0, output_name_to_node + ) + if cast_node_2 is None: + logger.debug("cast_node_2 not found") + return + subgraph_nodes.extend([node_before_reduce, cast_node_2, cast_node_3]) + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + node.output, + self.model.input_name_to_nodes(), + self.model.output_name_to_node(), + ): + logger.debug("not safe to fuse layer normalization") + return + + self.nodes_to_remove.extend(subgraph_nodes) + + weight_input = mul_node_1.input[1] + bias_input = sub_node_0.input[0] + + # TODO: add epsilon attribute + fused_node = helper.make_node( + "LayerNormalization", + inputs=[mul_node_3.input[0], weight_input, bias_input], + outputs=[node.output[0]], + name=self.model.create_node_name( + "LayerNormalization", name_prefix="LayerNorm" + ), + ) + fused_node.attribute.extend([helper.make_attribute("epsilon", float(epsilon))]) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py new file mode 100644 index 0000000000000000000000000000000000000000..e0a1a535b13b391736f91064799b89f422eb600a --- /dev/null +++ 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_options.py @@ -0,0 +1,170 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from argparse import ArgumentParser + + +class AttentionMaskFormat: + MaskIndexEnd = 0 + MaskIndexEndAndStart = 1 + AttentionMask = 2 + NoMask = 3 + + +class FusionOptions: + """Options of fusion in graph optimization""" + + def __init__(self, model_type): + self.enable_gelu = True + self.enable_layer_norm = True + self.enable_attention = True + self.enable_skip_layer_norm = True + self.enable_embed_layer_norm = True + self.enable_bias_skip_layer_norm = True + self.enable_bias_gelu = True + self.enable_gelu_approximation = False + self.enable_qordered_matmul = True + + self.enable_shape_inference = True + self.enable_swint_opt = False + self.enable_format_roformer = False + self.enable_gpt2_classify = False + self.enable_vit = False + self.attention_mask_format = AttentionMaskFormat.AttentionMask + + if model_type == "gpt2": + self.enable_skip_layer_norm = False + self.enable_gpt2_classify = True + elif model_type == "swint": + self.enable_swint_opt = True + elif model_type == "roformer": + self.enable_format_roformer = True + elif model_type == "vit": + self.enable_vit = True + + def use_raw_attention_mask(self, use_raw_mask=True): + if use_raw_mask: + self.attention_mask_format = AttentionMaskFormat.AttentionMask + else: + self.attention_mask_format = AttentionMaskFormat.MaskIndexEnd + + def disable_attention_mask(self): + self.attention_mask_format = AttentionMaskFormat.NoMask + + @staticmethod + def parse(args): + options = FusionOptions(args.model_type) + if args.disable_gelu: + options.enable_gelu = False + if args.disable_layer_norm: + options.enable_layer_norm = False + if args.disable_attention: + options.enable_attention = False + if args.disable_skip_layer_norm: + options.enable_skip_layer_norm = False + if args.disable_embed_layer_norm: + options.enable_embed_layer_norm = False + if args.disable_bias_skip_layer_norm: + options.enable_bias_skip_layer_norm = False + if args.disable_bias_gelu: + options.enable_bias_gelu = False + if args.enable_gelu_approximation: + options.enable_gelu_approximation = True + if args.disable_shape_inference: + options.enable_shape_inference = False + if args.use_mask_index: + options.use_raw_attention_mask(False) + if args.no_attention_mask: + options.disable_attention_mask() + return options + + @staticmethod + def add_arguments(parser: ArgumentParser): + parser.add_argument( + "--disable_attention", + required=False, + action="store_true", + help="disable Attention fusion", + ) + parser.set_defaults(disable_attention=False) + + parser.add_argument( + "--disable_skip_layer_norm", + required=False, + action="store_true", + help="disable SkipLayerNormalization fusion", + ) + parser.set_defaults(disable_skip_layer_norm=False) + + parser.add_argument( + "--disable_embed_layer_norm", + required=False, + action="store_true", + help="disable EmbedLayerNormalization fusion", + ) + parser.set_defaults(disable_embed_layer_norm=False) + + parser.add_argument( + "--disable_bias_skip_layer_norm", + required=False, + action="store_true", + help="disable Add Bias and SkipLayerNormalization fusion", + ) + parser.set_defaults(disable_bias_skip_layer_norm=False) + + parser.add_argument( + 
"--disable_bias_gelu", + required=False, + action="store_true", + help="disable Add Bias and Gelu/FastGelu fusion", + ) + parser.set_defaults(disable_bias_gelu=False) + + parser.add_argument( + "--disable_layer_norm", + required=False, + action="store_true", + help="disable LayerNormalization fusion", + ) + parser.set_defaults(disable_layer_norm=False) + + parser.add_argument( + "--disable_gelu", + required=False, + action="store_true", + help="disable Gelu fusion", + ) + parser.set_defaults(disable_gelu=False) + + parser.add_argument( + "--enable_gelu_approximation", + required=False, + action="store_true", + help="enable Gelu/BiasGelu to FastGelu conversion", + ) + parser.set_defaults(enable_gelu_approximation=False) + + parser.add_argument( + "--disable_shape_inference", + required=False, + action="store_true", + help="disable symbolic shape inference", + ) + parser.set_defaults(disable_shape_inference=False) + + parser.add_argument( + "--use_mask_index", + required=False, + action="store_true", + help="use mask index instead of raw attention mask in attention operator", + ) + parser.set_defaults(use_mask_index=False) + + parser.add_argument( + "--no_attention_mask", + required=False, + action="store_true", + help="no attention mask. Only works for model_type=bert", + ) + parser.set_defaults(no_attention_mask=False) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..b9b502acb41a9a34f31b4ace3c9d01ea218382ec --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_attention.py @@ -0,0 +1,421 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Tuple + +import numpy as np +from .fusion_attention import AttentionMask +from .fusion_base import Fusion +from .fusion_utils import FusionUtils, NumpyHelper +from onnx import NodeProto, helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionQOrderedAttention(Fusion): + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + attention_mask: AttentionMask, + ): + self.hidden_size = hidden_size + self.num_heads = num_heads + self.attention_mask = attention_mask + + super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization") + + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. 
+ Args: + reshape_q (NodeProto): reshape node for Q + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + q_shape = self.model.get_initializer(reshape_q.input[1]) + if q_shape is None: + logger.debug(f"{reshape_q.input[1]} is not initializer.") + + # Check if the second input to Reshape flows through a Constant node + # TODO: Investigate why FusionAttention doesn't have such logic + constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1]) + + if constant_node is None: + return self.num_heads, self.hidden_size # Fall back to user specified value + else: + constant_node = constant_node[0] + + if len(constant_node.attribute) != 1: + return self.num_heads, self.hidden_size # Fall back to user specified value + + # This is assuming it is a Tensor attribute (this is a safe assumption) + q_shape = constant_node.attribute[0].t + + q_shape_value = NumpyHelper.to_array(q_shape) + if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): + logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].") + return self.num_heads, self.hidden_size # Fall back to user specified value + + num_heads = q_shape_value[2] + head_size = q_shape_value[3] + hidden_size = num_heads * head_size + + if self.num_heads > 0 and num_heads != self.num_heads: + if self.num_heads_warning: + logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.") + self.num_heads_warning = False # Do not show the warning more than once + + if self.hidden_size > 0 and hidden_size != self.hidden_size: + if self.hidden_size_warning: + logger.warning( + f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." 
+ ) + self.hidden_size_warning = False # Do not show the warning more than once + + return num_heads, hidden_size + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + add_before_layernorm = self.model.match_parent_path( + normalize_node, + ["QuantizeLinear", "Add"], + [0, 0], + ) + + if add_before_layernorm is not None: + start_node = add_before_layernorm[-1] + else: + return + + # Input QDQ nodes + dequantize_input = self.model.match_parent_path( + start_node, + ["DequantizeLinear"], + [None], + ) + + if dequantize_input is None: + logger.debug("fuse_qordered_attention: failed to match input qdq nodes path") + return + + dequantize_input = dequantize_input[-1] + + # QKV nodes + qkv_nodes = self.model.match_parent_path( + start_node, + ["Add", "MatMul", "Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear", "MatMul"], + [None, None, 0, 0, 0, 0, 0], + ) + + if qkv_nodes is None: + logger.debug("fuse_qordered_attention: failed to match qkv path") + return + + (_, projection_matmul, reshape_qkv, transpose_qkv, dequantize_qkv, quantize_qkv, matmul_qkv) = qkv_nodes + + # Make sure the Q/DQ has the proper zero points and constant per-tensor scales + if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model): + return + + # Identify the root input to the Attention node + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + + other_inputs.append(input) + + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + + # V nodes + v_nodes = self.model.match_parent_path( + matmul_qkv, + ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"], + [1, 0, 0, 0, 0, None], + ) + + if v_nodes is None: + logger.debug("fuse_qordered_attention: failed to match v path") + return + + (_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes + + # Make sure the Q/DQ has the proper zero points and constant per-tensor scales + if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model): + return + + # V MatMul weight + dequantize_v_matmul_weight = self.model.match_parent_path(matmul_v, ["DequantizeLinear"], [1]) + + if dequantize_v_matmul_weight is None: + logger.debug("fuse_qordered_attention: failed to match v path") + return + + dequantize_v_matmul_weight = dequantize_v_matmul_weight[0] + + if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None: + return + + # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales + # Per-channel scales are supported for weights alone + if not FusionUtils.check_qdq_node_for_fusion(dequantize_v_matmul_weight, self.model, False): + return + + # QK nodes + qk_nodes = self.model.match_parent_path( + matmul_qkv, + [ + "DequantizeLinear", + "QuantizeLinear", + "Softmax", + "Add", + "Div", + "DequantizeLinear", + "QuantizeLinear", + "MatMul", + ], + [0, 0, 0, 0, None, 0, 0, 0], + ) + + if qk_nodes is None: + logger.debug("fuse_qordered_attention: failed to match qk path") + return + + ( + dequantize_qk_softmax, + quantize_qk_softmax, + softmax_qk, + add_qk, + div_qk, + dequantize_qk, + quantize_qk, + matmul_qk, + ) = qk_nodes + + # Make sure the Q/DQ has the proper zero points and constant per-tensor scales + if not 
FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model): + return + + # Q nodes + q_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"], + [0, 0, 0, 0, 0, None], + ) + + if q_nodes is None: + logger.debug("fuse_qordered_attention: failed to match q path") + return + + (_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes + + # Make sure the Q/DQ has the proper zero points and constant per-tensor scales + if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model): + return + + # Q MatMul weight + dequantize_q_matmul_weight = self.model.match_parent_path(matmul_q, ["DequantizeLinear"], [1]) + + if dequantize_q_matmul_weight is None: + logger.debug("fuse_qordered_attention: failed to match q path") + return + + dequantize_q_matmul_weight = dequantize_q_matmul_weight[0] + + if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None: + return + + # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales + # Per-channel scales are supported for weights alone + if not FusionUtils.check_qdq_node_for_fusion(dequantize_q_matmul_weight, self.model, False): + return + + # K nodes + k_nodes = self.model.match_parent_path( + matmul_qk, + ["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"], + [1, 0, 0, 0, 0, None], + ) + + if k_nodes is None: + logger.debug("fuse_qordered_attention: failed to match k path") + return + + (_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes + + # Make sure the Q/DQ has the proper zero points and constant per-tensor scales + if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model): + return + + if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model): + return + + # K MatMul weight + dequantize_k_matmul_weight = self.model.match_parent_path(matmul_k, ["DequantizeLinear"], [1]) + + if dequantize_k_matmul_weight is None: + logger.debug("fuse_qordered_attention: failed to match k path") + return + + dequantize_k_matmul_weight = dequantize_k_matmul_weight[0] + + if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None: + return + + # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales + # Per-channel scales are supported for weights alone + if not FusionUtils.check_qdq_node_for_fusion(dequantize_k_matmul_weight, self.model, False): + return + + # Mask nodes + mask_nodes = self.model.match_parent_path( + add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0] + ) + + if mask_nodes is None: + logger.debug("fuse_qordered_attention: failed to match mask_nodes path") + return + + # Ascertain `qkv_hidden_sizes` attribute value + q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0]) + k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0]) + v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0]) + + qw = NumpyHelper.to_array(q_weight) + kw = NumpyHelper.to_array(k_weight) + vw = NumpyHelper.to_array(v_weight) + + qw_out_size = np.prod(qw.shape[1:]) + kw_out_size = np.prod(kw.shape[1:]) + 
vw_out_size = np.prod(vw.shape[1:]) + + # Form QOrderedAttention node + if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: + mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) + + # Ascertain `num_heads` and `hidden_size` + num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + + # Formulate the inputs + # Actual quantized input + attention_inputs = [dequantize_input.input[0]] + attention_inputs.append(dequantize_input.input[1]) + + attention_inputs.append(dequantize_q.input[1]) + attention_inputs.append(dequantize_k.input[1]) + attention_inputs.append(dequantize_v.input[1]) + + attention_inputs.append(dequantize_q_matmul_weight.input[0]) + attention_inputs.append(dequantize_k_matmul_weight.input[0]) + attention_inputs.append(dequantize_v_matmul_weight.input[0]) + + attention_inputs.append(dequantize_q_matmul_weight.input[1]) + attention_inputs.append(dequantize_k_matmul_weight.input[1]) + attention_inputs.append(dequantize_v_matmul_weight.input[1]) + + if self.model.get_initializer(add_q.input[0]): + attention_inputs.append(add_q.input[0]) + else: # second input is the constant bias + attention_inputs.append(add_q.input[1]) + + if self.model.get_initializer(add_k.input[0]): + attention_inputs.append(add_k.input[0]) + else: # second input is the constant bias + attention_inputs.append(add_k.input[1]) + + if self.model.get_initializer(add_v.input[0]): + attention_inputs.append(add_v.input[0]) + else: # second input is the constant bias + attention_inputs.append(add_v.input[1]) + + attention_inputs.append(quantize_qk.input[1]) + attention_inputs.append(quantize_qk_softmax.input[1]) + attention_inputs.append(dequantize_qkv.input[1]) + + # Mask input + if mask_index is not None: + attention_inputs.append(mask_index) + else: + attention_inputs.append("") + + # The MatMul weight 'B' and 'bias' need some post-processing + # Transpose weight 'B' from order ROW to order COL + # This offline transpose is needed only while using the CUDA EP + # TODO: Make this fusion logic EP-agnostic ? 
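+            # FusionUtils.transpose_2d_int8_tensor is used here without taking a return value,
+            # i.e. it transposes each INT8 weight initializer in place, so the Q/K/V weights
+            # consumed by QOrderedAttention end up in the column-major (ORDER_COL) layout that
+            # the CUDA EP kernel expects.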
+ q_weight_tensor = self.model.get_initializer(dequantize_q_matmul_weight.input[0]) + FusionUtils.transpose_2d_int8_tensor(q_weight_tensor) + + k_weight_tensor = self.model.get_initializer(dequantize_k_matmul_weight.input[0]) + FusionUtils.transpose_2d_int8_tensor(k_weight_tensor) + + v_weight_tensor = self.model.get_initializer(dequantize_v_matmul_weight.input[0]) + FusionUtils.transpose_2d_int8_tensor(v_weight_tensor) + + # Name and create Attention node + attention_node_name = self.model.create_node_name("QOrderedAttention") + + attention_node = helper.make_node( + "QOrderedAttention", + inputs=attention_inputs, + outputs=[reshape_qkv.output[0]], + name=attention_node_name, + ) + + self.model.replace_node_input(dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0]) + self.model.replace_node_input(projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0]) + + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend([helper.make_attribute("order_input", 1)]) + attention_node.attribute.extend([helper.make_attribute("order_weight", 0)]) + attention_node.attribute.extend([helper.make_attribute("order_output", 1)]) + attention_node.attribute.extend( + [helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])] + ) + + attention_node.domain = "com.microsoft" + + self.nodes_to_add.append(attention_node) + self.node_name_to_graph_name[attention_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv]) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes) + self.nodes_to_remove.extend( + [dequantize_q_matmul_weight, dequantize_k_matmul_weight, dequantize_v_matmul_weight] + ) + + # Use prune graph to remove mask nodes since they are shared by all attention nodes. + # self.nodes_to_remove.extend(mask_nodes) + self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce59f784bc9242213c9e0dc699764d8c50e0fb2 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_gelu.py @@ -0,0 +1,117 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionQOrderedGelu(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"]) + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + INPUT PATTERN + Fuse (quantized) Gelu subgraph into one node QOrderedGelu: + -> quantized input -> DQ -> Gelu -> Q -> + + (or) + + -> quantized input -> DQ -> FastGelu -> Q -> + + OUTPUT PATTERN + -> QOrderedGelu -> + """ + gelu_children = self.model.get_children(node, input_name_to_nodes) + + # Should only have 1 child - QuantizeLinear (or) + # Should have 2 children - QuantizeLinear + Shape + if not ( + (len(gelu_children) == 1 and gelu_children[0].op_type == "QuantizeLinear") + or ( + len(gelu_children) == 2 + and gelu_children[0].op_type == "QuantizeLinear" + and gelu_children[1].op_type == "Shape" + ) + ): + return + + downstream_quantize_node = gelu_children[0] + downstream_shape_node = None + + if len(gelu_children) == 2: + downstream_shape_node = gelu_children[1] + + if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model): + return + + # The first input to Gelu should flow through a DequantizeLinear node + first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( + node, + [(["DequantizeLinear"], [0])], + output_name_to_node, + ) + + if first_path_id < 0: + return + + upstream_dequantize_node = first_input_parent_nodes[0] + + if not FusionUtils.check_qdq_node_for_fusion(upstream_dequantize_node, self.model): + return + + # Fusion logic + subgraph_nodes = [node] # Gelu/FastGelu + subgraph_nodes.extend([downstream_quantize_node, upstream_dequantize_node]) # Relevant Q, DQ nodes + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [node.output[0], downstream_quantize_node.output[0]] + if downstream_shape_node is not None + else downstream_quantize_node.output, + input_name_to_nodes, + output_name_to_node, + ): + logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip") + return + + self.nodes_to_remove.extend(subgraph_nodes) + + ordered_gelu_node = helper.make_node( + "QOrderedGelu", + inputs=[ + upstream_dequantize_node.input[0], + upstream_dequantize_node.input[1], + downstream_quantize_node.input[1], + ], + outputs=[downstream_quantize_node.output[0]], + name=self.model.create_node_name("QOrderedGelu", name_prefix="QOrderedGelu"), + ) + + # Arrange the downstream Shape's input to be fed from the + # downstream QuantizeLinear node, so that fusion will + # be deemed safe + if downstream_shape_node is not None: + self.model.replace_node_input( + downstream_shape_node, downstream_shape_node.input[0], downstream_quantize_node.output[0] + ) + + # TODO: We only support CuBlasLt order ORDER_ROW for now. + # Once we start supporting other data ordering format(s), we + # will support user configuring the data ordering for the op. 
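+        # order_X / order_Y = 1 corresponds to the CuBlasLt ORDER_ROW layout mentioned above,
+        # applied to the quantized input and output of the fused QOrderedGelu node.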
+ ordered_gelu_node.attribute.extend([helper.make_attribute("order_X", 1)]) + ordered_gelu_node.attribute.extend([helper.make_attribute("order_Y", 1)]) + + ordered_gelu_node.domain = "com.microsoft" + + self.nodes_to_add.append(ordered_gelu_node) + self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..08def4a20f205658df7ca9371e9fb9509103657b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_layernorm.py @@ -0,0 +1,121 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionQOrderedLayerNormalization(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + Fuse (quantized) Layer Normalization subgraph into one node QOrderedLayerNormalization: + quantized input -> DQ + | + | + (other inputs)-> LayerNormalization --> Q --> + + should become + + (quantized input + other inputs)-> QOrderedLayerNormalization --> Q --> + """ + + children = self.model.get_children(node, input_name_to_nodes) + + # Should only have 1 child - QuantizeLinear (or) + # Should have 2 children - QuantizeLinear + Shape + if not ( + (len(children) == 1 and children[0].op_type == "QuantizeLinear") + or (len(children) == 2 and children[0].op_type == "QuantizeLinear" and children[1].op_type == "Shape") + ): + return + + downstream_quantize_node = children[0] + downstream_shape_node = None + + if len(children) == 2: + downstream_shape_node = children[1] + + if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model): + return + + # The first input to LayerNormalization should flow through a DequantizeLinear node + first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( + node, + [(["DequantizeLinear"], [0])], + output_name_to_node, + ) + + if first_path_id < 0: + return + + upstream_dequantize_node = first_input_parent_nodes[0] + + if not FusionUtils.check_qdq_node_for_fusion(upstream_dequantize_node, self.model): + return + + # Fusion logic + subgraph_nodes = [node] # LayerNormalization + subgraph_nodes.extend([downstream_quantize_node]) # Q node after LayerNormalization + + upstream_dequantize_node_children = self.model.get_children(upstream_dequantize_node, input_name_to_nodes) + + # In GPT2, the DQ node will be feeding a residual downstream Add and hence, + # we do not want to remove it + if len(upstream_dequantize_node_children) == 1: + subgraph_nodes.extend([upstream_dequantize_node]) # DQ node before LayerNormalization + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + [node.output[0], downstream_quantize_node.output[0]] + if downstream_shape_node is not None + else 
downstream_quantize_node.output, + input_name_to_nodes, + output_name_to_node, + ): + logger.debug(f"It is not safe to fuse QOrderedLayerNormalization node. Skip") + return + + self.nodes_to_remove.extend(subgraph_nodes) + + normalize_node = helper.make_node( + "QOrderedLayerNormalization", + inputs=[ + upstream_dequantize_node.input[0], + upstream_dequantize_node.input[1], + node.input[1], + node.input[2], + downstream_quantize_node.input[1], + ], + outputs=[downstream_quantize_node.output[0]], + name=self.model.create_node_name("QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization"), + ) + + # Arrange the downstream Shape's input to be fed from the + # downstream QuantizeLinear node, so that fusion will + # be deemed safe + if downstream_shape_node is not None: + self.model.replace_node_input( + downstream_shape_node, downstream_shape_node.input[0], downstream_quantize_node.output[0] + ) + + # TODO: We only support CuBlasLt order ORDER_ROW for now. + # Once we start supporting other data ordering format(s), we + # will support user configuring the data ordering for the op. + normalize_node.attribute.extend([helper.make_attribute("order_X", 1)]) + normalize_node.attribute.extend([helper.make_attribute("order_Y", 1)]) + + normalize_node.domain = "com.microsoft" + + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py new file mode 100644 index 0000000000000000000000000000000000000000..de0196c53b3f6e8c38301adae476dcfd6f524aa3 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_qordered_matmul.py @@ -0,0 +1,217 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionQOrderedMatMul(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "QOrderedMatMul", "MatMul") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + matmul_children = self.model.get_children(node, input_name_to_nodes) + + # Should only have 1 child - Bias Add + if len(matmul_children) != 1 or matmul_children[0].op_type != "Add": + return + + bias_add_node = matmul_children[0] + + # Atleast one of the inputs to Bias Add node must be a constant + bias_add_node_index = 0 + if ( + self.model.get_constant_value(bias_add_node.input[0]) is None + and self.model.get_constant_value(bias_add_node.input[1]) is None + ): + return + + if self.model.get_constant_value(bias_add_node.input[0]) is None: + bias_add_node_index = 1 + + bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes) + + if len(bias_add_children) != 1: + return + + bias_add_child = bias_add_children[0] + + # Bias Add can have another Add downstream (Residual Add layer) + residual_add_node = None + + downstream_quantize_node = None + + if bias_add_child.op_type == "Add": + residual_add_node = bias_add_child + + residual_add_children = self.model.get_children(residual_add_node, input_name_to_nodes) + + if len(residual_add_children) != 1 or residual_add_children[0].op_type != "QuantizeLinear": + return + + downstream_quantize_node = residual_add_children[0] + + elif bias_add_child.op_type == "QuantizeLinear": + downstream_quantize_node = bias_add_child + + else: + return + + # Make sure the downstream QuantizeLinear has the proper zero points and scales + if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model): + return + + # The first input to MatMul should flow through a DequantizeLinear node + first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( + node, + [(["DequantizeLinear"], [0])], + output_name_to_node, + ) + + # If Attention is not fused, this is the pattern to look for + # leading upto the MatMul + reshape_node_0 = None + transpose_node_0 = None + if first_path_id < 0: + first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths( + node, + [(["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], [0, 0, 0, 0])], + output_name_to_node, + ) + + if first_path_id < 0: + return + + reshape_node_0 = first_input_parent_nodes[0] + transpose_node_0 = first_input_parent_nodes[1] + dequantize_node_0 = first_input_parent_nodes[2] + else: + dequantize_node_0 = first_input_parent_nodes[0] + + # Make sure the upstream DequantizeLinear-0 has the proper zero points and scales + if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model): + return + + # The second input to MatMul should flow through a DequantizeLinear node + dequantize_node_1 = None + is_weight_transpose_required = True + + weight_path_id, weight_nodes, _ = self.model.match_parent_paths( + node, + [(["DequantizeLinear", "QuantizeLinear", "Transpose", "DequantizeLinear"], [1, 0, 0, 0])], + output_name_to_node, + ) + + if weight_path_id < 0: + weight_path_id, weight_nodes, _ = self.model.match_parent_paths( + node, + [(["DequantizeLinear"], [1])], + output_name_to_node, + ) + + if weight_path_id < 0: + 
return + + dequantize_node_1 = weight_nodes[0] + else: + is_weight_transpose_required = False + dequantize_node_1 = weight_nodes[3] + + # Check if weight 'B' is a constant + if self.model.get_constant_value(dequantize_node_1.input[0]) is None: + return + + # Make sure the upstream DequantizeLinear-1 has the proper zero points and scales + # Per-channel scales are supported for weights alone + if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_1, self.model, False): + return + + # Make sure the upstream flow into the Residual Add node flows through a DQ node + residual_add_dequantize_node = None + + if residual_add_node is not None: + residual_path_id, residual_input_parent_nodes, _ = self.model.match_parent_paths( + residual_add_node, + [ + (["DequantizeLinear"], [1]), + ], + output_name_to_node, + ) + + if residual_path_id < 0: + return + + residual_add_dequantize_node = residual_input_parent_nodes[0] + + # Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales + if residual_add_dequantize_node is not None and not FusionUtils.check_qdq_node_for_fusion( + residual_add_dequantize_node, self.model + ): + return + + # Subgraph nodes to be fused + subgraph_nodes = [node, bias_add_node] # MatMul + Bias Add + + if residual_add_node is not None: + subgraph_nodes.extend([residual_add_node]) # Residual Add + + subgraph_nodes.extend(weight_nodes) + subgraph_nodes.extend([downstream_quantize_node]) # Downstream Q node + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node + ): + logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip") + return + + # Deal with the case where-in the Attention subgraph is not fused + if transpose_node_0 is not None: + self.model.replace_node_input(transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0]) + + # Make inputs + fused_node_inputs = [ + reshape_node_0.output[0] if reshape_node_0 is not None else dequantize_node_0.input[0], + dequantize_node_0.input[1], + dequantize_node_1.input[0], + dequantize_node_1.input[1], + downstream_quantize_node.input[1], + bias_add_node.input[bias_add_node_index], + ] + + if residual_add_node is not None: + fused_node_inputs.append(residual_add_dequantize_node.input[0]) + fused_node_inputs.append(residual_add_dequantize_node.input[1]) + + # The MatMul weight 'B' and 'bias' need some post-processing + # Transpose weight 'B' from order ROW to order COL + # This offline transpose is needed only while using the CUDA EP + # TODO: Make this fusion logic EP-agnostic ? 
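+        # The offline transpose converts the INT8 weight initializer from row-major to
+        # column-major in place; it is skipped when the weight already reaches the MatMul
+        # through an explicit DQ -> Transpose -> Q -> DQ chain (see is_weight_transpose_required).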
+ if is_weight_transpose_required: + weight_tensor = self.model.get_initializer(dequantize_node_1.input[0]) + FusionUtils.transpose_2d_int8_tensor(weight_tensor) + + fused_node = helper.make_node( + "QOrderedMatMul", + inputs=fused_node_inputs, + outputs=[downstream_quantize_node.output[0]], + name=self.model.create_node_name("QOrderedMatMul", name_prefix="QOrderedMatMul"), + ) + + fused_node.attribute.extend([helper.make_attribute("order_A", 1)]) + fused_node.attribute.extend([helper.make_attribute("order_B", 0)]) + fused_node.attribute.extend([helper.make_attribute("order_Y", 1)]) + + fused_node.domain = "com.microsoft" + + self.nodes_to_remove.extend(subgraph_nodes) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py new file mode 100644 index 0000000000000000000000000000000000000000..d2b46c16cac89d23bbdbea86b0b418bff792fcdc --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_reshape.py @@ -0,0 +1,175 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger + +import numpy as np +from .fusion_base import Fusion +from onnx import TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionReshape(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Reshape", "Reshape") + self.prune_graph: bool = False + + def replace_reshape_node(self, shape, reshape_node, concat_node): + shape_value = np.asarray(shape, dtype=np.int64) + constant_shape_name = self.model.create_node_name("Constant", "constant_shape") + new_node = helper.make_node( + "Constant", + inputs=[], + outputs=[constant_shape_name], + value=helper.make_tensor( + name="const_tensor", + data_type=TensorProto.INT64, + dims=shape_value.shape, + vals=bytes(shape_value), + raw=True, + ), + ) + reshape_node.input[1] = constant_shape_name + reshape_node.name = self.model.create_node_name("Reshape", "Reshape_Fuse") + self.nodes_to_remove.extend([concat_node]) + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): + if reshape_node.input[1] not in output_name_to_node: + return + + concat_node = output_name_to_node[reshape_node.input[1]] + if concat_node.op_type != "Concat" or len(concat_node.input) < 3 or len(concat_node.input) > 4: + return + + path0 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Gather", "Shape"], + [0, 0, 0], + output_name_to_node, + ) + if path0 is None: + return + + (unsqueeze_0, gather_0, shape_0) = path0 + + path1 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Gather", "Shape"], + [1, 0, 0], + output_name_to_node, + ) + if path1 is None: + return + (unsqueeze_1, gather_1, shape_1) = path1 + + shape = [] + gather_value = self.model.get_constant_value(gather_0.input[1]) + if gather_value == 0: + shape.append(0) + + gather_value = self.model.get_constant_value(gather_1.input[1]) + if gather_value == 1: + shape.append(0) + + if 
len(shape) != 2: + return + + path2 = [] + path3 = [] + shape_nodes = [shape_0, shape_1] + if len(concat_node.input) == 3 and self.model.get_initializer(concat_node.input[2]) is None: + path2 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Mul", "Gather", "Shape"], + [2, 0, 0, 0], + output_name_to_node, + ) + if path2 is None: + path2 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], + [2, 0, 0, 0, 0], + output_name_to_node, + ) # GPT2 exported by PyTorch 1.4 with opset_version=11 + if path2 is None: + return + + path3 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Mul", "Gather", "Shape"], + [2, 0, 1, 0], + output_name_to_node, + ) + if path3 is None: + path3 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Mul", "Squeeze", "Slice", "Shape"], + [2, 0, 1, 0, 0], + output_name_to_node, + ) # GPT2 exported by PyTorch 1.4 with opset_version=11 + if path3 is None: + return + + shape_nodes.extend([path2[-1], path3[-1]]) + shape.append(-1) + elif len(concat_node.input) > 2: + concat_2 = self.model.get_initializer(concat_node.input[2]) + if concat_2 is None: + return + concat_value = numpy_helper.to_array(concat_2) + if isinstance(concat_value, list): + shape.extend(concat_value) + else: + shape.append(concat_value) + + if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None: + if -1 in shape: + return + + path2 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Div", "Gather", "Shape"], + [3, 0, 0, 0], + output_name_to_node, + ) + if path2 is None: + path2 = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Div", "Squeeze", "Slice", "Shape"], + [3, 0, 0, 0, 0], + output_name_to_node, + ) # GPT2 exported by PyTorch 1.4 with opset_version=11 + if path2 is None: + return + shape_nodes.extend([path2[-1]]) + shape.append(-1) + elif len(concat_node.input) > 3: + concat_3 = self.model.get_initializer(concat_node.input[3]) + if concat_3 is None: + return + + concat_value = numpy_helper.to_array(concat_3) + if isinstance(concat_value, list): + shape.extend(concat_value) + else: + shape.append(concat_value) + + root_input = reshape_node.input[0] + same_shape_input = True + for shape_node in shape_nodes: + if shape_node.input[0] != root_input: + same_shape_input = False + + if not same_shape_input: + return + + self.replace_reshape_node(shape, reshape_node, concat_node) + + # TODO(tlwu): Subgraph blocks pruning un-used nodes. Add code to remove un-used nodes safely. 
+ self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..c831f15c58907e4069fcbebe7d23078c7b47bf06 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rms_norm.py @@ -0,0 +1,155 @@ +import logging +from typing import Dict + +from onnx import helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = logging.getLogger(__name__) + + +class FusionRMSNorm(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "RMSNorm", "Mul") + + def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): + if node.op_type != "Mul": + return + + sim_ln_nodes = None + # SimplifiedLayerNorm calculation (notation from https://onnx.ai/onnx/operators/onnx__LayerNormalization.html#summary): + # DD = Pow(D, 2) + # Var = ReduceMean(DD) + # VarEps = Add(Var, epsilon) + # StdDev = Sqrt(VarEps) + # InvStdDev = Div(1, StdDev) + # Normalized = Mul(D, InvStdDev) + # NormalizedScaled = Mul(Normalized, Scale) + + # RMSNorm + # +-------------------------------------------------------+ + # | | + # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_1 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], + [1, 1, 1, 0, 0, 0, 0], + ) + # RMSNorm + # +-------------------------------------------------------+ + # | | + # Gather --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_2 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Gather"], + [1, 1, 1, 0, 0, 0, 0], + ) + + # For LLaMA from Microsoft custom export: + # sim_ln_nodes_3 uses a different start parent index than sim_ln_nodes_1 + # + # RMSNorm + # +-------------------------------------------------------+ + # | | + # Add --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Mul + # | + # node + sim_ln_nodes_3 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow", "Add"], + [0, 1, 1, 0, 0, 0, 0], + ) + + # sim_ln_nodes_4 starts with a graph input instead of an Add node like sim_ln_nodes_3 + # + # RMSNorm + # +-----------------------------------------------+ + # | | + # graph_input --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul + # | + # node + sim_ln_nodes_4 = self.model.match_parent_path( + node, + ["Mul", "Div", "Sqrt", "Add", "ReduceMean", "Pow"], + [0, 1, 1, 0, 0, 0], + ) + + add_node, pow_node = None, None + if sim_ln_nodes_1 is not None: + sim_ln_nodes = sim_ln_nodes_1 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_2 is not None: + sim_ln_nodes = sim_ln_nodes_2 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_3 is not None: + sim_ln_nodes = sim_ln_nodes_3 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-2] + elif sim_ln_nodes_4 is not None: + sim_ln_nodes = sim_ln_nodes_4 + add_node = sim_ln_nodes[3] + pow_node = sim_ln_nodes[-1] + # Verify that parent input to Pow node is graph_input + if pow_node.input[0] not in self.model.get_graphs_input_names(): + return + else: + return + + layernorm_weight_index = ( + 1 if sim_ln_nodes in (sim_ln_nodes_3, sim_ln_nodes_4) 
else 0 + ) + starts_with_graph_input = sim_ln_nodes == sim_ln_nodes_4 + + if self.model.find_constant_input(pow_node, 2.0) != 1: + return + + root_input = pow_node.input[0] + if root_input != sim_ln_nodes[0].input[0]: + return + + i, add_weight = self.model.get_constant_input(add_node) + if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4: + logger.warning(f"epsilon value is not expected: {add_weight}") + return + + self.nodes_to_remove.extend( + sim_ln_nodes[:-1] if not starts_with_graph_input else sim_ln_nodes + ) + self.nodes_to_remove.append(node) + + normalize_node = helper.make_node( + "RMSNormPluginDynamic_IxRT", + inputs=[root_input, node.input[layernorm_weight_index]], + outputs=[node.output[0]], + name=self.model.create_node_name( + "RMSNormPluginDynamic_IxRT", name_prefix="RMSNorm_" + ), + ) + + normalize_node.domain = "com.iluvatar" + normalize_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + normalize_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + normalize_node.attribute.extend( + [helper.make_attribute("epsilon", float(add_weight))] + ) + normalize_node.attribute.extend([helper.make_attribute("axis", -1)]) + normalize_node.attribute.extend([helper.make_attribute("stash_type", 1)]) + gamma_data = self.model.get_initializer(normalize_node.input[1]) + gamma_data_np = NumpyHelper.to_array(gamma_data) + normalize_node.attribute.extend( + [helper.make_attribute("hidden_size", int(gamma_data_np.shape[0]))] + ) + + normalize_node.attribute.extend([helper.make_attribute("gamma", gamma_data)]) + + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + return True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..a5079c2d38c8fd465e49ca51735c570706c9bd40 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_roformer_attention.py @@ -0,0 +1,368 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import math +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +class FusionRoformerCrossAttention(Fusion): + """ + Fuse VideoBertAttention subgraph into one Attention node. 
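+    The matched subgraph is fused into a single CustomQkvCrossToContext_IxRT plugin node.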
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(
+            model,
+            "CustomQkvCrossToContext_IxRT",
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
+        )
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, mul: NodeProto
+    ) -> Tuple[int, int]:
+        mul_initializer = self.model.get_initializer(mul.input[1])
+
+        # Check whether float_data is populated
+        if len(mul_initializer.float_data) > 0:
+            mul_value = mul_initializer.float_data[0]
+        else:
+            # float_data is empty, so try to read the value another way,
+            # e.g. when the data is stored in raw_data
+            if len(mul_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
+                mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the mul_initializer")
+
+        for attr in custom_fc.attribute:
+            if attr.name == "W":
+                tensor_value = attr.t
+                tensor_shape = [dim for dim in tensor_value.dims]
+                break
+        head_dim = math.floor(1.0 / (mul_value * mul_value))
+        hidden_size = tensor_shape[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input_q: str,
+        input_k: str,
+        input_v: str,
+        input_mask: str,
+        output: str,
+        matmul_qk_add: NodeProto,
+    ) -> Union[NodeProto, None]:
+        """Create an Attention node.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input_q (str): input name for Q
+            input_k (str): input name for K
+            input_v (str): input name for V
+            input_mask (str): input name for the attention mask
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+        """
+        assert num_heads > 0
+
+        if hidden_size > 0 and (hidden_size % num_heads) != 0:
+            logger.debug(
+                f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}"
+            )
+            return None
+
+        attention_node_name = self.model.create_node_name("CrossAttention")
+
+        attention_inputs = [input_q, input_k, input_v, input_mask]
+
+        attention_node = helper.make_node(
+            "CustomQkvCrossToContext_IxRT",
+            inputs=attention_inputs,
+            outputs=[output],
+            name=attention_node_name,
+        )
+        attention_node.domain = "com.iluvatar"
+        attention_node.attribute.extend([helper.make_attribute("type_id", 2)])
+        attention_node.attribute.extend([helper.make_attribute("has_mask", 1)])
+        attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")])
+        attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")])
+
+        return attention_node
+
+    def get_shape(self, edge_name):
+        for info in self.model.graph().value_info:
+            if info.name == edge_name:
+                return info.type.tensor_type.shape.dim
+        return None
+
+    def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
+        # Sometimes we cannot fuse SkipLayerNormalization since the Add before LayerNorm has an output that is used by nodes outside the SkipLayerNorm pattern.
+        # Conceptually we treat the Add before LayerNorm as a SkipLayerNorm node since they share the same pattern.
+        start_node = normalize_node
+
+        # SkipLayerNormalization has two inputs, and one of them is the root input for attention.
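+        # The attention output projection is found by walking parents of the start node:
+        # CustomFC, then Reshape, Transpose, Reshape and finally the QKV MatMul; the two
+        # entries (path1 / path2) cover reaching it through either input of the start node.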
+ qkv_paths = { + "path1": ( + [ + "CustomFCPluginDynamic_IxRT", + "Reshape", + "Transpose", + "Reshape", + "MatMul", + ], + [0, 0, 0, 0, 0], + ), + "path2": ( + [ + "CustomFCPluginDynamic_IxRT", + "Reshape", + "Transpose", + "Reshape", + "MatMul", + ], + [1, 0, 0, 0, 0], + ), + } + # print('start_nodes:', start_node.name) + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + + fc_after_atten = None + if qkv_path in ["path1", "path2"]: + ( + fc_after_atten, + reshape_qkv_2, + transpose_qkv, + reshape_qkv_1, + matmul_qkv, + ) = qkv_nodes + + """ + Match + Add --> LayerNormalization --> Attention --> Add --> LayerNormalization + | | + | | + +--------------------------------------------------------- + """ + add_before_layernorm = self.model.match_parent(start_node, "Add", None) + if add_before_layernorm is not None: + node_children = input_name_to_nodes[add_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == "LayerNormalization": + root_input = child.output[0] + + v_paths = {"path1": (["Reshape", "Transpose", "Reshape"], [1, 0, 0])} + + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + if v_path == "path1": + (reshape_v, transpose_v, v_reshape) = v_nodes + + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + + qk_paths = { + "path1": ( + ["Softmax", "Add", "Mul", "Mul", "Reshape", "MatMul"], + [0, 0, None, None, None, 0], + ) + } + + qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) + + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + # print('qk_nodes', qk_nodes[0].name) + matmul_qk_add = None + if qk_path == "path1": + (_, add_mask, mul_mask, mul_qk, reshape_qk, matmul_qk) = qk_nodes + + q_paths = { + "path1": (["Transpose", "Add"], [0, 0]), + } + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + # print('q_nodes', q_nodes[0].name) + if q_path == "path1": + (q_tranpose, q_add) = q_nodes + + k_paths = { + "path1": (["Reshape", "Transpose", "Add"], [1, 0, 0]), + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) + + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + # print('k_nodes', k_nodes[0].name) + if k_path == "path1": + (_, k_transpose, k_add) = k_nodes + # print('add_mask', add_mask.name) + mask_paths = { + "path1": ( + ["Mul", "Sub", "Unsqueeze", "Cast", "Greater"], + [1, None, 1, 0, 0], + ) + } + mask_nodes, mask_path = self.match_parent_path_from_dict(add_mask, mask_paths) + + if mask_nodes is None: + logger.debug("fuse_attention: failed to match mask path") + return + # print('mask_nodes', mask_nodes[0].name) + (_, mask_sub, mask_unsqueeze, mask_cast, mask_greater) = mask_nodes + + if ( + self.get_shape(q_add.output[0]) == self.get_shape(k_add.output[0]) + and self.get_shape(k_add.output[0]) == self.get_shape(v_reshape.output[0]) + and mul_mask.input[1] in mask_unsqueeze.output + ): + attention_last_node = reshape_qkv_1 + + num_heads, hidden_size = self.get_num_heads_and_hidden_size( + fc_after_atten, mul_qk + ) + + q_transpose_type = None + q_transpose_name = None + for info in self.model.graph().value_info: + if info.name == q_tranpose.output[0]: + q_transpose_type = info.type + q_transpose_name = info.name + break + 
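+            # Build explicit Transpose nodes (perm [0, 2, 1, 3]) that feed Q, K and V into the
+            # fused attention node, plus (further below) a Cast node that converts the mask to
+            # INT32 for the plugin.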
+ q_transpose_output = helper.make_value_info( + q_transpose_name[:-2] + "_fake_q", q_transpose_type + ) + q_transpose_node = helper.make_node( + "Transpose", + inputs=[q_add.output[0]], + outputs=[q_transpose_output.name], + name=q_transpose_output.name, + ) + q_transpose_node.attribute.extend( + [helper.make_attribute("perm", [0, 2, 1, 3])] + ) + + k_transpose_output = helper.make_value_info( + q_transpose_name[:-2] + "_fake_k", q_transpose_type + ) + k_transpose_node = helper.make_node( + "Transpose", + inputs=[k_add.output[0]], + outputs=[k_transpose_output.name], + name=k_transpose_output.name, + ) + k_transpose_node.attribute.extend( + [helper.make_attribute("perm", [0, 2, 1, 3])] + ) + + v_transpose_output = helper.make_value_info( + q_transpose_name[:-2] + "_fake_v", q_transpose_type + ) + v_transpose_node = helper.make_node( + "Transpose", + inputs=[v_reshape.output[0]], + outputs=[v_transpose_output.name], + name=v_transpose_output.name, + ) + v_transpose_node.attribute.extend( + [helper.make_attribute("perm", [0, 2, 1, 3])] + ) + + mask_type = None + for info in self.model.graph().value_info: + if info.name == mask_sub.output[0]: + mask_type = info.type + break + + new_mask_type = onnx.TypeProto() + new_mask_type.tensor_type.elem_type = onnx.TensorProto.INT32 + for dim in mask_type.tensor_type.shape.dim: + new_dim = new_mask_type.tensor_type.shape.dim.add() + new_dim.CopyFrom(dim) + + mask_cast_to_int32_output = helper.make_value_info( + mask_sub.name + "_cast_to_int32", new_mask_type + ) + mask_cast_to_int32_node = helper.make_node( + "Cast", + inputs=[mask_sub.output[0]], + outputs=[mask_cast_to_int32_output.name], + name=mask_cast_to_int32_output.name, + ) + mask_cast_to_int32_node.attribute.extend([helper.make_attribute("to", 6)]) + + new_node = self.create_attention_node( + num_heads, + hidden_size, + q_transpose_node.output[0], + k_transpose_node.output[0], + v_transpose_node.output[0], + mask_cast_to_int32_node.output[0], + attention_last_node.output[0], + matmul_qk_add, + ) + if new_node is None: + return + + self.nodes_to_add.extend( + [ + q_transpose_node, + k_transpose_node, + v_transpose_node, + new_node, + mask_cast_to_int32_node, + ] + ) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name + self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name + self.node_name_to_graph_name[v_transpose_node.name] = self.this_graph_name + self.node_name_to_graph_name[ + mask_cast_to_int32_node.name + ] = self.this_graph_name + + self.nodes_to_remove.extend(qkv_nodes[3:]) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes[:-1]) + self.nodes_to_remove.extend(k_nodes[:-1]) + self.nodes_to_remove.extend(v_nodes[:-1]) + self.nodes_to_remove.extend([mask_nodes[0]]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py new file mode 100644 index 0000000000000000000000000000000000000000..2ca376c39904b298973f403c2989418ec17e460e --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_rope.py @@ -0,0 +1,83 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger + +from onnx import helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionRoPE(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "CustomRoPEPluginDynamic_IxRT", "Add") + + def fuse(self, start_node, input_name_to_nodes, output_name_to_node): + src_paths = {"path1": (["Mul", "Concat", "Split", "Slice"], [0, 1, None, 0])} + src_nodes, src_path = self.match_parent_path_from_dict(start_node, src_paths) + if src_nodes is None: + logger.debug("fuse_rope: failed to match src_node") + return + + src_node = src_nodes[0] + + rotate_paths = {"path1": (["Mul", "Reshape", "Concat"], [1, 0, 0])} + rotate_nodes, rotate_path = self.match_parent_path_from_dict( + start_node, rotate_paths + ) + + if rotate_nodes is None: + logger.debug("fuse_rope: failed to match rotate_path") + return + + concat_node = rotate_nodes[-1] + mul_right_node = rotate_nodes[0] + + odd_paths = {"path1": (["Unsqueeze", "Neg", "Slice", "Reshape"], [0, 0, 0, 0])} + odd_nodes, odd_path = self.match_parent_path_from_dict(concat_node, odd_paths) + + if odd_nodes is None: + logger.debug("fuse_rope: failed to match odd_path") + return + + even_paths = {"path1": (["Unsqueeze", "Slice", "Reshape"], [1, 0, 0])} + even_nodes, even_path = self.match_parent_path_from_dict( + concat_node, even_paths + ) + + if even_nodes is None: + logger.debug("fuse_rope: failed to match even_path") + return + reshape_node = even_nodes[-1] + + if reshape_node.output[0] == src_node.input[0]: + rope_node_name = self.model.create_node_name("RoPE") + rope_node = helper.make_node( + "CustomRoPEPluginDynamic_IxRT", + inputs=[ + reshape_node.output[0], + src_nodes[0].input[1], + mul_right_node.input[1], + ], + outputs=[start_node.output[0]], + name=rope_node_name, + ) + rope_node.domain = "com.iluvatar" + rope_node.attribute.extend([helper.make_attribute("type_id", 2)]) + rope_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + rope_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + + self.nodes_to_add.append(rope_node) + self.node_name_to_graph_name[rope_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([start_node]) + self.nodes_to_remove.extend([src_nodes[0]]) + self.nodes_to_remove.extend(rotate_nodes) + self.nodes_to_remove.extend(odd_nodes[:-1]) + self.nodes_to_remove.extend(even_nodes[:-1]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py new file mode 100644 index 0000000000000000000000000000000000000000..b47be680f13948c63ea73694d443488cf992daa1 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_shape.py @@ -0,0 +1,110 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict, List, Union + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils +from onnx import NodeProto, TensorProto +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionShape(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Shape", "Concat") + self.utils = FusionUtils(model) + self.shape_infer = None + self.shape_infer_done = False + + def get_dimensions_from_tensor_proto(self, tensor_proto: TensorProto) -> Union[int, None]: + if tensor_proto.type.tensor_type.HasField("shape"): + return len(tensor_proto.type.tensor_type.shape.dim) + else: + return None + + def get_dimensions(self, input_name: str) -> Union[int, None]: + graph_input = self.model.find_graph_input(input_name) + if graph_input: + return self.get_dimensions_from_tensor_proto(graph_input) + + if not self.shape_infer_done: + self.shape_infer = self.model.infer_runtime_shape({}, update=True) + self.shape_infer_done = True + + if self.shape_infer is not None: + return self.get_dimensions_from_tensor_proto(self.shape_infer.known_vi_[input_name]) + + return None + + def fuse( + self, + concat_node: NodeProto, + input_name_to_nodes: Dict[str, List[NodeProto]], + output_name_to_node: Dict[str, NodeProto], + ): + """ + Smplify subgraph like + + (2d_input) + / \ + Shape shape + / \ + Gather(indices=0) Gather(indices=1) + | | + Unsqueeze(axes=0) Unsqueeze(axes=0) + \ / + Concat + | + + into (2d_input) --> Shape --> + """ + opset_version = self.model.get_opset_version() + + inputs = len(concat_node.input) + root = None + shape_output = None + for i in range(inputs): + path = self.model.match_parent_path( + concat_node, + ["Unsqueeze", "Gather", "Shape"], + [i, 0, 0], + output_name_to_node, + ) + if path is None: + return + + unsqueeze, gather, shape = path + if i == 0: + shape_output = shape.output[0] + if root is None: + root = shape.input[0] + if self.get_dimensions(root) != inputs: + return + elif shape.input[0] != root: + return + + if not FusionUtils.check_node_attribute(unsqueeze, "axis", 0, default_value=0): + return + + if opset_version < 13: + if not FusionUtils.check_node_attribute(unsqueeze, "axes", [0]): + return + else: + if not self.utils.check_node_input_value(unsqueeze, 1, [0]): + return + + value = self.model.get_constant_value(gather.input[1]) + from numpy import array_equal, ndarray + + if not (isinstance(value, ndarray) and value.size == 1 and value.item() == i): + return + + if self.model.find_graph_output(concat_node.output[0]) is None: + self.model.replace_input_of_all_nodes(concat_node.output[0], shape_output) + self.fused_count += 1 + self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..5868964467ee7555ea3b47603402f4034885c590 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_skiplayernorm.py @@ -0,0 +1,212 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger + +from onnx import helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionSkipLayerNormalization(Fusion): + """ + Fuse Add + LayerNormalization into one node: SkipLayerNormalization + Note: This fusion does not check the input shape of Add and LayerNormalization. + """ + + def __init__(self, model: OnnxModel): + super().__init__( + model, "CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization" + ) + # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. + self.shape_infer_helper = self.model.infer_runtime_shape( + {"batch_size": 4, "seq_len": 7}, update=True + ) + + if self.shape_infer_helper is None: + # TODO(tianleiwu): support subgraph in shape inference or add broadcasting in SkipLayerNormalization op. + logger.warning("symbolic shape inference disabled or failed.") + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + add = self.model.get_parent(node, 0, output_name_to_node) + + # In some models there is input_ids->gather->add->LayerNorm and one of input of the + # add node is initializer with fixed shape which should not be fused into SkipLayerNorm + if add is None: + return + + for add_input in add.input: + if self.model.get_initializer(add_input) != None: + return + + # The number of input node of add should be 2 + if len(self.model.get_parents(add)) != 2: + return + + if self.shape_infer_helper is not None: + if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): + logger.debug( + "skip SkipLayerNormalization fusion since shape of inputs (%s, %s) are not same", + add.input[0], + add.input[1], + ) + return + else: + layernorm_weight = self.model.get_initializer(node.input[1]) + if layernorm_weight is not None: + layernorm_weight_arr = NumpyHelper.to_array(layernorm_weight) + hidden_size = layernorm_weight_arr.shape[0] + else: + logger.debug( + "skip SkipLayerNormalization fusion since symbolic shape inference failed" + ) + return + + # gather_path = self.model.match_parent_path(add, ["Gather"], [None]) + # if gather_path is not None and self.model.find_graph_input(gather_path[0].input[1]) is None: + # if self.model.match_parent_path(gather_path[0], ["ConstantOfShape"], [1]) is None: + # return + + if ( + add is not None + and add.op_type == "Add" + and self.model.is_safe_to_fuse_nodes( + [add, node], node.output, input_name_to_nodes, output_name_to_node + ) + ): + self.nodes_to_remove.extend([add, node]) + + inputs = [add.input[0], add.input[1]] + normalize_node = helper.make_node( + "CustomSkipLayerNormPluginDynamic_IxRT", + inputs=inputs, + outputs=[node.output[0]], + name=self.model.create_node_name( + "SkipLayerNormalization", name_prefix="SkipLayerNorm" + ), + ) + normalize_node.domain = "com.iluvatar" + if self.shape_infer_helper is not None: + hidden_size = self.shape_infer_helper.get_edge_shape(node.input[1])[-1] + normalize_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) + normalize_node.attribute.extend([helper.make_attribute("type_id", 2)]) + normalize_node.attribute.extend( + [ + helper.make_attribute( + "beta", self.model.get_initializer(node.input[2]) + ) + ] + ) + normalize_node.attribute.extend( + [ + helper.make_attribute( + "gamma", self.model.get_initializer(node.input[1]) + ) + ] + ) + normalize_node.attribute.extend( + 
[helper.make_attribute("plugin_namespace", "")] + ) + normalize_node.attribute.extend( + [helper.make_attribute("plugin_version", "1")] + ) + + self.nodes_to_add.append(normalize_node) + self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name + + +class FusionBiasSkipLayerNormalization(Fusion): + def __init__(self, model: OnnxModel): + super().__init__( + model, + "CustomSkipLayerNormPluginDynamic_IxRT", + "SkipLayerNormalization", + "add bias", + ) + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + if len(node.input) != 4: + return + + return_indice = [] + nodes = self.model.match_parent_path( + node, ["Add", "MatMul"], [None, None], None, return_indice + ) + if nodes is None: + return + assert len(return_indice) == 2 + add_input_index = return_indice[0] + if add_input_index >= 2: + return + + (add, matmul) = nodes + + # bias should be one dimension + bias_index = -1 + for i, input in enumerate(add.input): + initializer = self.model.get_initializer(input) + if initializer is None: + continue + bias_index = i + bias_weight = NumpyHelper.to_array(initializer) + break + if bias_weight is None: + logger.debug(f"Bias weight not found") + return + if len(bias_weight.shape) != 1: + logger.debug(f"Bias weight is not 1D") + return + + subgraph_nodes = [node, add] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [node.output[0]], input_name_to_nodes, output_name_to_node + ): + logger.debug( + f"Skip fusing SkipLayerNormalization with Bias since it is not safe" + ) + return + + self.nodes_to_remove.extend(subgraph_nodes) + inputs = [ + node.input[1 - add_input_index], + matmul.output[0], + node.input[2], + node.input[3], + add.input[bias_index], + ] + new_node = helper.make_node( + "CustomSkipLayerNormPluginDynamic_IxRT", + inputs=inputs, + outputs=node.output, + name=self.model.create_node_name( + "SkipLayerNormalization", "SkipLayerNorm_AddBias_" + ), + ) + new_node.domain = "com.iluvatar" + hidden_size = self.shape_infer_helper.get_edge_shape(node.input[2])[-1] + new_node.attribute.extend([helper.make_attribute("ld", hidden_size)]) + new_node.attribute.extend([helper.make_attribute("type_id", 2)]) + new_node.attribute.extend( + [helper.make_attribute("beta", self.model.get_initializer(node.input[3]))] + ) + new_node.attribute.extend( + [helper.make_attribute("gamma", self.model.get_initializer(node.input[2]))] + ) + new_node.attribute.extend( + [ + helper.make_attribute( + "bias", self.model.get_initializer(add.input[bias_index]) + ) + ] + ) + new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py new file mode 100644 index 0000000000000000000000000000000000000000..a74fe9ee0a86a88f271b085ae1b946b97b394e7e --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_splitQKV.py @@ -0,0 +1,109 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Tuple, Union + +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionSplitQKV(Fusion): + """ + Fuse FusionSplitQKV + """ + + def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int): + super().__init__(model, "SplitQKV_IxRT", "MatMul") + + self.hidden_size = hidden_size + self.num_heads = num_heads + + def create_splitqkv_node( + self, input: str, query_out: str, key_out: str, value_out: str + ) -> Union[NodeProto, None]: + """Create an XSoftmax node. + + Args: + data_input (str): data input name + mask_input (str): max input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. + """ + node_name = self.model.create_node_name("SplitQKV_IxRT") + + new_node = helper.make_node( + "SplitQKV_IxRT", + inputs=[input], + outputs=[query_out, key_out, value_out], + name=node_name, + ) + new_node.domain = "com.iluvatar" + new_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + new_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + new_node.attribute.extend( + [helper.make_attribute("atten_scale", 1 / self.num_heads)] + ) + + return new_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + + split_query_paths = { + "query_path": ( + ["Div", "Transpose", "Reshape", "Slice", "CustomFCPluginDynamic_IxRT"], + [0, 0, 0, 0, 0], + ), + } + + split_key_paths = { + "key_path": (["Transpose", "Reshape", "Slice"], [1, 0, 0]), + } + + q_nodes, q_path = self.match_parent_path_from_dict(node, split_query_paths) + + k_nodes, k_path = self.match_parent_path_from_dict(node, split_key_paths) + + if (q_nodes is not None) and (k_nodes is not None): + ( + q_div_node, + q_transpose_node, + q_reshape_node, + q_slice_node, + coustom_fc_node, + ) = q_nodes + k_transpose_node, k_reshape_node, k_slice_node = k_nodes + slice_nodes = self.model.get_children(coustom_fc_node) + + if len(slice_nodes) != 3: + return + slice_nodes.remove(q_slice_node) + slice_nodes.remove(k_slice_node) + v_slice_node = slice_nodes[0] + + node.input[0] = q_div_node.input[0] # dele div + new_node = self.create_splitqkv_node( + coustom_fc_node.output[0], + q_slice_node.output[0], + k_slice_node.output[0], + v_slice_node.output[0], + ) + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + self.nodes_to_remove.append(q_slice_node) + self.nodes_to_remove.append(k_slice_node) + self.nodes_to_remove.append(v_slice_node) + self.nodes_to_remove.append(q_div_node) + + else: + return diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..8edb9a5ada34fdc7ae8a5f8b0fecc0d57b57257b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_swinl_attention.py @@ -0,0 +1,321 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union, List + +import numpy as np +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from onnx import NodeProto, TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto +import onnx + +logger = getLogger(__name__) + + +def get_tensor_attr(attrs, attr_name): + result = None + for i in attrs: + if i.name == attr_name: + return numpy_helper.to_array(i.t) + return result + + +class FusionSwinLAttention(Fusion): + """ + Fuse SwinL subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["CustomFCPluginDynamic_IxRT"]) + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def get_num_heads_and_hidden_size(self, reshape_v: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + v_shape = self.model.get_initializer(reshape_v.input[1]) + if v_shape is None: + logger.debug(f"{reshape_v.input[1]} is not initializer.") + return self.num_heads, self.hidden_size # Fall back to user specified value + + v_shape_value = NumpyHelper.to_array(v_shape) + if len(v_shape_value) != 3 or (v_shape_value[1] <= 0 or v_shape_value[2] <= 0): + logger.debug(f"v_shape_value={v_shape_value}. Expected value are like [0, 0, num_heads, head_size].") + return self.num_heads, self.hidden_size # Fall back to user specified value + + num_heads = 1 + for value_info in self.model.graph().value_info: + if value_info.name == reshape_v.input[0]: + num_heads = value_info.type.tensor_type.shape.dim[2].dim_value + break + hidden_size = v_shape_value[2] + + return num_heads, hidden_size + + def create_attention_node( + self, + num_heads: int, + hidden_size: int, + inputs: List[str], + output: str, + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. 
+ """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") + return None + + attention_node_name = self.model.create_node_name("Attention") + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) + attention_node.attribute.extend([helper.make_attribute("has_mask", 1)]) + attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend([helper.make_attribute("has_qk_bias", 1)]) + return attention_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + self.fuse_pattern1(normalize_node, input_name_to_nodes, output_name_to_node) + self.fuse_pattern2(normalize_node, input_name_to_nodes, output_name_to_node) + + def fuse_pattern2(self, normalize_node, input_name_to_nodes, output_name_to_node): + """ match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC + """ + logger.debug("fuse swin-L attention pass") + # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern + start_node = normalize_node + qkv_paths = { + "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), + } + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + assert qkv_path == 'path1', 'abnormal qkv path' + reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes + + # 2. MatMul as start, go up to find v path + v_paths = { + "path1": (["Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], [None, 0, 0]) + } + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + if not v_nodes: + logger.debug("fuse_attention: failed to match v path") + return + assert v_path == 'path1', 'abnormal v path' + + # 3. MatMul as start, go up to find q,k paths + # q path + q_paths = { + "path1": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], + [None, 0, 0, 0, 0, 0, 0]), + } + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) + if not q_nodes: + logger.debug("fuse_attention: failed to match q path") + return + assert q_path == 'path1', 'abnormal q paths found' + + # get Add(bias) input name as fused Attention inputs + add_op, div_op = q_nodes[1], q_nodes[2] + relative_position_bias_name = add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] + + # k path + k_paths = { + "path2": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "CustomFCPluginDynamic_IxRT"], + [None, 0, 0, 0, 1, 0, 0]) + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) + if not k_nodes: + logger.debug("fuse_attention: failed to match k path") + return + assert k_path == 'path2', 'abnormal k paths found' + # 4. 
Fuse 3 CustomFC into one, and fuse attention + # Fuse FCs + fc_nodes = [q_nodes[-1], k_nodes[-1], v_nodes[-1]] + weight = self.fuse_tensor_in_node_attrs(fc_nodes, "W", q_nodes[-1].name + "_Weight") + bias = self.fuse_tensor_in_node_attrs(fc_nodes, "B", q_nodes[-1].name + "_Bias") + fused_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[q_nodes[-1].input[0]], + outputs=q_nodes[-1].output, + name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("out_dims", numpy_helper.to_array(bias).shape[0])]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fused_node.attribute.extend([helper.make_attribute("W", weight)]) + fused_node.attribute.extend([helper.make_attribute("B", bias)]) + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + + # Fuse Attention + num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) + attention_node = self.create_attention_node( + num_heads, + hidden_size, + [fused_node.output[0], relative_position_bias_name], + reshape_qkv.output[0], + ) + if not attention_node: + return + self.nodes_to_add.append(attention_node) + self.node_name_to_graph_name[attention_node.name] = self.this_graph_name + self.nodes_to_remove.extend([*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes]) + self.prune_graph = True + + def fuse_pattern1(self, normalize_node, input_name_to_nodes, output_name_to_node): + """ match Swin-L pattern and fuse them to CustomFC --> Attention --> CustomFC + """ + logger.debug("fuse swin-L attention pass") + # 1. CustomFCPluginDynamic_IxRT node as start, go up to find a pattern for swin-L pattern + start_node = normalize_node + qkv_paths = { + "path1": (["Reshape", "Transpose", "MatMul"], [0, 0, 0]), + } + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + assert qkv_path == 'path1', 'abnormal qkv path' + reshape_qkv, transpose_qkv, matmul_qkv = qkv_nodes + + # 2. MatMul as start, go up to find v path + v_paths = { + "path1": (["Transpose", "Reshape", "Add", "Split", "MatMul"], [None, 0, 0, None, 0]) + } + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + if not v_nodes: + logger.debug("fuse_attention: failed to match v path") + return + assert v_path == 'path1', 'abnormal v path' + + # 3. 
MatMul as start, go up to find q,k paths + # q path + q_paths = { + "path1": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "Add", "Split", "MatMul"], + [None, 0, 0, 0, 0, 0, 0, None, 0]), + } + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qkv, q_paths) + if not q_nodes: + logger.debug("fuse_attention: failed to match q path") + return + assert q_path == 'path1', 'abnormal q paths found' + + # get Add(bias) input name as fused Attention inputs + add_op, div_op = q_nodes[1], q_nodes[2] + relative_position_bias_name = add_op.input[1] if add_op.input[0] == div_op.output[0] else add_op.input[0] + + # k path + k_paths = { + "path2": (["Softmax", "Add", "Div", "MatMul", "Transpose", "Reshape", "Add", "Split", "MatMul"], + [None, 0, 0, 0, 1, 0, 0, None, 0]) + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qkv, k_paths) + if not k_nodes: + logger.debug("fuse_attention: failed to match k path") + return + assert k_path == 'path2', 'abnormal k paths found' + # 4. Attention and CustomFC have been found, now transform the found nodes to two plugin nodes + # Test 3 paths have the same origin + is_same_origin = q_nodes[-1] is k_nodes[-1] is v_nodes[-1] + is_same_origin &= q_nodes[-2] is k_nodes[-2] is v_nodes[-2] + is_same_origin &= q_nodes[-3] is not k_nodes[-2] is not v_nodes[-3] + if not is_same_origin: + print("swin-L fuse_attention: found qkv path but not has the same origin") + return + origin_matmul = q_nodes[-1] + fc_add = [q_nodes[-3], k_nodes[-3], v_nodes[-3]] + # Now fuse + num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_qkv) + + # Fuse FC + weight = self.model.get_initializer(origin_matmul.input[1]) + biases = [self.model.get_initializer(i.input[0]) for i in fc_add] + if not weight or not all(biases): + print("swin-L: couldn't find weights") + return + weight_arr = onnx.numpy_helper.to_array(weight).transpose(1,0) + weight.CopyFrom(numpy_helper.from_array(weight_arr)) + bias_arr = np.concatenate([onnx.numpy_helper.to_array(i) for i in biases], axis=0) + + fused_node = helper.make_node( + "CustomFCPluginDynamic_IxRT", + inputs=[origin_matmul.input[0]], + outputs=fc_add[0].output, + name=self.model.create_node_name("CustomFC", "MatMul_AddBias_"), + ) + fused_node.domain = "com.iluvatar" + fused_node.attribute.extend([helper.make_attribute("out_dims", bias_arr.shape[0])]) + fused_node.attribute.extend([helper.make_attribute("type_id", 2)]) + fused_node.attribute.extend([helper.make_attribute("W", weight)]) + fused_node.attribute.extend([helper.make_attribute("B", numpy_helper.from_array(bias_arr))]) + fused_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + fused_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + fused_node.attribute.extend([helper.make_attribute("act_type", -1)]) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + self.nodes_to_add.append(fused_node) + # Fuse Attention + attention_node = self.create_attention_node( + num_heads, + hidden_size, + [fused_node.output[0], relative_position_bias_name], + reshape_qkv.output[0], + + ) + if not attention_node: + return + self.nodes_to_add.append(attention_node) + self.node_name_to_graph_name[attention_node.name] = self.this_graph_name + self.nodes_to_remove.extend([*qkv_nodes, *q_nodes[:-2], *k_nodes[:-2], *v_nodes]) + self.prune_graph = True + + def fuse_tensor_in_node_attrs(self, fc_nodes, attr_name, tensor_name): + result = [get_tensor_attr(i.attribute, attr_name) for i in fc_nodes] + result = 
np.concatenate(result, axis=0) + result = numpy_helper.from_array(result, tensor_name) + return result diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..661e8375973d1dd6706ad95a112ddc177a178d53 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_t5_attention.py @@ -0,0 +1,312 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import math +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +class FusionT5Attention(Fusion): + """ + Fuse T5Attention subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__( + model, + "CustomQKVToContextPluginDynamic_IxRT", + ["CustomSkipLayerNormPluginDynamic_IxRT", "RMSNormPluginDynamic_IxRT"], + ) + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + q_shape = self.model.get_initializer(reshape_q.input[1]) + if q_shape is None: + logger.debug(f"{reshape_q.input[1]} is not initializer.") + return [0, 0] + + q_shape_value = NumpyHelper.to_array(q_shape) + if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): + logger.debug( + f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size]." + ) + return [0, 0] + + num_heads = q_shape_value[2] + head_size = q_shape_value[3] + hidden_size = num_heads * head_size + + return num_heads, hidden_size + + def create_attention_node( + self, + num_heads: int, + hidden_size: int, + input: str, + output: str, + matmul_qk_add: NodeProto, + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. 
+ """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug( + f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" + ) + return None + + attention_node_name = self.model.create_node_name("Attention") + + qk_bias = None + has_mask = 0 + has_qk_bias = 0 + add_input_is_value = False + if matmul_qk_add is not None: + has_qk_bias = 1 + qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) + if qk_bias: + add_input_is_value = True + qk_bias_arr = NumpyHelper.to_array(qk_bias) + if len(qk_bias_arr.shape) == 3: + qk_bias_arr = qk_bias_arr.squeeze(0) + has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) + if np.any(has_neg_inf): + qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( + np.float32 + ) + qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) + + attention_inputs = [input] + + # 如果add的输入不是值,而是一个边,那么这个边的值需要cast到fp32 + cast_node = None + if not add_input_is_value: + cast_out_name = attention_node_name + "_fp32_in1" + cast_out_tensor = helper.make_tensor_value_info( + cast_out_name, TensorProto.FLOAT, [None, None, None, None] + ) + # self.model.add_initializer(cast_out_name) + cast_node = helper.make_node( + "Cast", + inputs=[matmul_qk_add.input[1]], + outputs=[cast_out_tensor.name], + name=self.model.create_node_name("Cast"), + to=1, + ) + self.node_name_to_graph_name[cast_node.name] = self.this_graph_name + attention_inputs.append(cast_out_name) + + if has_qk_bias: + if add_input_is_value: + has_mask = 1 + attention_inputs.append(qk_bias.name) + else: + has_mask = 1 + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=attention_inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend( + [helper.make_attribute("hidden_size", hidden_size)] + ) + attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) + attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend( + [helper.make_attribute("has_qk_bias", has_qk_bias)] + ) + attention_node.attribute.extend([helper.make_attribute("is_t5_mode", 1)]) + + return attention_node, cast_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + if normalize_node.op_type == "RMSNormPluginDynamic_IxRT": + add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + if add_before_layernorm is not None: + start_node = add_before_layernorm + + # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
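+        # Rough sketch of the packed-QKV T5 attention block this pass looks for
+        # (illustrative only; the *_paths dicts below are the authoritative patterns):
+        #
+        #   root --> MatMul --> Split --> {Q, K, V}, each Reshape --> Transpose
+        #   scores  = MatMul(Q, K) --> [Add bias/mask] --> Softmax
+        #   context = MatMul(scores, V) --> Transpose --> Reshape --> MatMul --> Add/RMSNorm
+        #
+        # num_heads and head_size are later read from Q's Reshape target shape
+        # [0, 0, num_heads, head_size]; e.g. [0, 0, 8, 64] gives hidden_size = 512.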
+ qkv_paths = { + "path1": (["MatMul", "Reshape", "Transpose", "MatMul"], [0, 0, 0, 0]), + "path2": (["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0]), + } + + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + + if qkv_path in ["path1", "path2"]: + (atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes + + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + """ + Match T5 + Add/Gather --> LayerNormalization --> Attention --> Add --> LayerNormalization + | | + | | + +--------------------------------------------------- + """ + transpose_before_layernorm = self.model.match_parent(start_node, "Gather", 0) + if transpose_before_layernorm is not None: + node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": + root_input = child.output[0] + + add_before_layernorm = self.model.match_parent(start_node, "Add", None) + if add_before_layernorm is not None: + node_children = input_name_to_nodes[add_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == "RMSNormPluginDynamic_IxRT": + root_input = child.output[0] + + v_paths = { + "path1": ( + ["Transpose", "Reshape", "Split", "MatMul"], + [1, 0, 0, None], + ) # T5 + } + + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + if v_path == "path1": + (_, _, _, matmul_in_qkv) = v_nodes + + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + + qk_paths = { + "path1": (["Softmax", "MatMul"], [0, 0]), + "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), + } + + qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) + + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + + matmul_qk_add = None + if qk_path == "path1": + (_, matmul_qk) = qk_nodes + else: + (_, matmul_qk_add, matmul_qk) = qk_nodes + + q_paths = {"path1": (["Transpose", "Reshape", "Split"], [0, 0, 0])} + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + + if q_path == "path1": + (_, reshape_q, split_q) = q_nodes + # print(" split_q.name : ", split_q.name) + + k_paths = { + "path1": (["Transpose", "Reshape", "Split"], [1, 0, 0]), + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) + + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + + if k_path == "path1": + (_, _, split_k) = k_nodes + + if ( + matmul_in_qkv.input[0] == root_input + and split_q.input[0] == matmul_in_qkv.output[0] + and split_k.input[0] == matmul_in_qkv.output[0] + ): + attention_last_node = reshape_qkv + + num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q) + + new_node, new_cast_node = self.create_attention_node( + num_heads, + hidden_size, + matmul_in_qkv.output[0], + attention_last_node.output[0], + matmul_qk_add, + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + if new_cast_node: + self.nodes_to_add.append(new_cast_node) + + self.node_name_to_graph_name[new_node.name] = 
self.this_graph_name + + self.nodes_to_remove.extend( + [attention_last_node, transpose_qkv, matmul_qkv] + ) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes[:-2]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5207f28f0a57f417b1cbd45fdeb88168e2baf50d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_utils.py @@ -0,0 +1,240 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Tuple + +import numpy +from numpy import array_equal, ndarray +from onnx import NodeProto, TensorProto, helper, numpy_helper +from onnx import onnx_pb as onnx_proto +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionUtils: + def __init__(self, model: OnnxModel): + self.model: OnnxModel = model + + def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: + graph_input = self.model.find_graph_input(input_name) + if graph_input is not None and graph_input.type.tensor_type.elem_type != TensorProto.INT32: + cast_output, cast_node = self.cast_input_to_int32(input_name) + logger.debug(f"Casted graph input {input_name} to int32") + return True, cast_output + + logger.debug(f"Did not cast graph input {input_name} to int32: found {graph_input is not None}") + return False, input_name + + def cast_input_to_int32(self, input_name: str): + cast_output = input_name + "_int32" + + # Avoid consequent Cast nodes. + inputs = [input_name] + output_name_to_node = self.model.output_name_to_node() + if input_name in output_name_to_node: + parent_node = output_name_to_node[input_name] + if parent_node and parent_node.op_type == "Cast": + inputs = [parent_node.input[0]] + + cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output]) + cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.INT32))]) + self.model.add_node(cast_node) + + return cast_output, cast_node + + def remove_cast_int32(self, input_name: str): + input_name_to_nodes = self.model.input_name_to_nodes() + nodes = input_name_to_nodes[input_name] + for node in nodes: + if node.op_type == "Cast": + is_int32 = False + for att in node.attribute: + if att.name == "to" and att.i == int(TensorProto.INT32): + is_int32 = True + break + if is_int32: + output_name = node.output[0] + self.model.remove_node(node) + self.model.replace_input_of_all_nodes(output_name, input_name) + + @staticmethod + def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): + """Verify that a node has expected value for an attribute. + + Args: + node (NodeProto): a node to check + attribute_name (str): name of attribute + expected_value (Any): expected value of the attribute + default_value (Any, optional): default value if the attribute does not exist. Defaults to None. 
+ + Returns: + bool: whether the check is passed or not + """ + value = default_value + for attr in node.attribute: + if attr.name == attribute_name: + value = helper.get_attribute_value(attr) + + if isinstance(expected_value, list): + return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( + expected_value, value, equal_nan=False + ) + else: + return value == expected_value + + @staticmethod + def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto): + """Transpose a 2-D INT8 TensorProto + Args: + tensor (TensorProto): tensor to be transposed + Returns: + tensor (TensorProto): transposed tensor + """ + if not isinstance(tensor, onnx_proto.TensorProto): + raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor)) + + if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8: + raise ValueError("Only INT8 2-D tensors can be transposed") + + if tensor.raw_data: + int32_data = numpy.reshape(numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims) + int32_transposed_data = numpy.transpose(int32_data, [1, 0]) + tensor.raw_data = int32_transposed_data.tobytes() + + else: + raise ValueError("only raw buffer supported") + + return tensor + + @staticmethod + def check_qdq_node_for_fusion(node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True): + """Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion. + It is a good candidate for fusion if: + (1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True` + (2) The Q/DQ node should have constant scale + (3) The Q/DQ node should have a zero point of 0 + Args: + node (NodeProto): a Q/DQ node to check + Returns: + bool: whether the check is passed or not + """ + if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + logger.debug(f"Provided node is not a Q/DQ node. 
Op Type: {node.op_type}") + + scale = model.get_constant_value(node.input[1]) + + # Scale is not constant + if scale is None: + return False + + # Not per-tensor quantization + scale_has_single_element = scale.ndim == 0 or (scale.ndim == 1 and scale.shape[0] == 1) + if allow_per_tensor_quantization_only and not scale_has_single_element: + return False + + # If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec) + if len(node.input) == 2: + return True + + # Zero point should be constant and should have a value of 0 + zero_point = model.get_constant_value(node.input[2]) + + # Zero point and scale should have same number of dims + if scale.ndim != zero_point.ndim: + return False + + # Zero point is not constant or zero point is not zero + if zero_point is None: + return False + + return numpy.all(zero_point == 0) + + def check_node_input_value(self, node, input_index: int, expected_value): + """Verify that a node has expected input value + + Args: + node (NodeProto): a node to check + input_index (int): index of its input to be verified + expected_value (Any): expected value of the input + + Returns: + bool: whether the check is passed or not + """ + assert len(node.input) > input_index + + value = self.model.get_constant_value(node.input[input_index]) + + if isinstance(expected_value, list): + return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( + expected_value, value, equal_nan=False + ) + else: + return value == expected_value + + def remove_identity_nodes(self): + """Remove Identity nodes, except those right before graph output.""" + nodes_to_remove = [] + for node in self.model.nodes(): + if node.op_type == "Identity": + if node.output[0] not in self.model.get_graphs_output_names(): + self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) + nodes_to_remove.append(node) + + if nodes_to_remove: + self.model.remove_nodes(nodes_to_remove) + logger.info(f"Removed {len(nodes_to_remove)} Identity nodes") + + def remove_cascaded_cast_nodes(self): + self.model.remove_cascaded_cast_nodes() + + def remove_useless_cast_nodes(self): + self.model.remove_useless_cast_nodes() + + def remove_useless_reshape_nodes(self): + """Remove reshape node that is not needed based on symbolic shape inference: input and output has same shape""" + shape_infer = self.model.infer_runtime_shape(update=True) + if shape_infer is None: + return + + nodes_to_remove = [] + for node in self.model.nodes(): + if node.op_type == "Reshape": + input_shape = shape_infer.get_edge_shape(node.input[0]) + output_shape = shape_infer.get_edge_shape(node.output[0]) + if input_shape and output_shape and input_shape == output_shape: + logger.info( + f"Remove reshape node {node.name} since its input shape is same as output: {input_shape}" + ) + nodes_to_remove.append(node) + + if nodes_to_remove: + graph_input_names = set(self.model.get_graphs_input_names()) + graph_output_names = set(self.model.get_graphs_output_names()) + for node in nodes_to_remove: + if bool(set(node.output) & graph_output_names): + if not bool(set(node.input) & graph_input_names): + self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) + else: + continue + else: + self.model.replace_input_of_all_nodes(node.output[0], node.input[0]) + self.model.remove_node(node) + + +class NumpyHelper: + @staticmethod + def to_array(tensor: TensorProto, fill_zeros: bool = False) -> ndarray: + # When weights are in external data format but not presented, we can still test the optimizer with two changes: + 
# (1) set fill_zeros = True (2) change load_external_data=False in optimizer.py + if fill_zeros: + from onnx import mapping + + return ndarray( + shape=tensor.dims, + dtype=mapping.TENSOR_TYPE_TO_NP_TYPE[tensor.data_type], + ) + + return numpy_helper.to_array(tensor) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1133877bf6717dc7a2336db9e2c7976cf35c1405 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_videobert_attention.py @@ -0,0 +1,306 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from onnx import NodeProto, TensorProto, helper, numpy_helper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto +import onnx +import math + +logger = getLogger(__name__) + +class FusionVideoBertAttention(Fusion): + """ + Fuse VideoBertAttention subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__(model, "CustomQKVToContextPluginDynamic_IxRT", ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"]) + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def get_num_heads_and_hidden_size(self, atten_matmul: NodeProto, div: NodeProto) -> Tuple[int, int]: + """Detect num_heads and hidden_size from a reshape node. + + Args: + reshape_q (NodeProto): reshape node for Q + + Returns: + Tuple[int, int]: num_heads and hidden_size + """ + + # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] + atten_matul_initializer = self.model.get_initializer(atten_matmul.input[1]) + div_initializer = self.model.get_initializer(div.input[1]) + + # 检查float_data是否为空 + if len(div_initializer.float_data) > 0: + div_value = div_initializer.float_data[0] + else: + # 如果float_data为空,尝试其他方式获取数据 + # 例如,如果数据存储在raw_data中 + if len(div_initializer.raw_data) > 0: + dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[div_initializer.data_type] + div_value = np.frombuffer(div_initializer.raw_data, dtype=dtype)[0] + else: + raise ValueError("Data not found in the div_initializer") + + atten_matul_shape_value = NumpyHelper.to_array(atten_matul_initializer).shape + head_dim = math.ceil(div_value*div_value) + hidden_size = atten_matul_shape_value[0] + num_heads = hidden_size // head_dim + + return num_heads, hidden_size + + def create_attention_node( + self, + num_heads: int, + hidden_size: int, + input: str, + output: str, + matmul_qk_add: NodeProto + ) -> Union[NodeProto, None]: + """Create an Attention node. + + Args: + num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. + hidden_size (int): hidden dimension. 
If a model is pruned, it is the hidden dimension after pruning. + input (str): input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. + """ + assert num_heads > 0 + + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") + return None + + attention_node_name = self.model.create_node_name("Attention") + + qk_bias = None + has_mask = 0 + has_qk_bias = 0 + if matmul_qk_add is not None: + has_qk_bias = 1 + qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) + qk_bias_arr = NumpyHelper.to_array(qk_bias) + if len(qk_bias_arr.shape) == 3: + qk_bias_arr = qk_bias_arr.squeeze(0) + has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) + if np.any(has_neg_inf): + qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype(np.float32) + qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) + + attention_inputs = [ + input + ] + + if qk_bias is not None: + has_mask = 1 + attention_inputs.append(qk_bias.name) + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=attention_inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend([helper.make_attribute("hidden_size", hidden_size)]) + attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) + attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend([helper.make_attribute("has_qk_bias", has_qk_bias)]) + + return attention_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + if normalize_node.op_type == "LayerNormalization": + add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + if add_before_layernorm is not None: + start_node = add_before_layernorm + + # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
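+        # Rough sketch of the VideoBERT attention block this pass looks for, where a
+        # single MatMul + Add emits packed QKV that is then Slice'd apart
+        # (illustrative only; the *_paths dicts below are the authoritative patterns):
+        #
+        #   root --> MatMul --> Add --> Slice x3 --> {Q, K, V}, each Reshape --> Transpose
+        #            (Q and K optionally scaled by a trailing Div)
+        #   scores  = MatMul(Q, K) --> [Add mask] --> Softmax
+        #   context = MatMul(scores, V) --> Transpose --> Reshape --> MatMul --> Add
+        #
+        # head_dim is recovered from that Div constant (div = sqrt(head_dim)); e.g. a
+        # Div by 8.0 implies head_dim = 64 and num_heads = hidden_size // 64.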
+ qkv_paths = { + "path1" : (["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [0, None, 0, 0, 0]), + "path2" : (["Add", "MatMul", "Reshape", "Transpose", "MatMul"], [1, None, 0, 0, 0]), + } + + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + + if qkv_path in ['path1', 'path2']: + (_, atten_matmul, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes + + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + """ + Match videobert + transpose/Add --> LayerNormalization --> Attention --> Add --> LayerNormalization + | | + | | + +--------------------------------------------------------- + """ + transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) + if transpose_before_layernorm is not None: + node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == 'LayerNormalization': + root_input = child.output[0] + + add_before_layernorm = self.model.match_parent(start_node, "Add", None) + if add_before_layernorm is not None: + node_children = input_name_to_nodes[add_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == 'LayerNormalization': + root_input = child.output[0] + + v_paths = { + "path1" : (["Transpose", "Reshape", "Slice", "Add", "MatMul"], [1, 0, 0, 0, None]) # videobert + } + + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + if v_path == 'path1': + (_, _, _, add_in_qkv, matmul_in_qkv) = v_nodes + + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + + qk_paths = { + "path1": (["Softmax", "MatMul"], [0, 0]), + "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]) + } + + qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) + + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + + matmul_qk_add = None + if qk_path == "path1": + (_, matmul_qk) = qk_nodes + else: + (_, matmul_qk_add, matmul_qk) = qk_nodes + + q_paths = { + "path1" : (["Transpose", "Reshape", "Slice"], [0, 0, 0]), + "path2" : (["Div", "Transpose", "Reshape", "Slice"], [0, 0, 0, 0]) + } + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + + if q_path == 'path1': + (_, _, slice_q) = q_nodes + else: + (div, _, _, slice_q) = q_nodes + + k_paths = { + "path1" : (["Transpose", "Reshape", "Slice"], [1, 0, 0]), + "path2" : (["Div", "Transpose", "Reshape", "Slice"], [1, 0, 0, 0]) + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) + + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + + if k_path == 'path1': + (_, _, slice_k) = k_nodes + else: + (div, _, _, slice_k) = k_nodes + + if matmul_in_qkv.input[0] == root_input and slice_q.input[0] == add_in_qkv.output[0] and slice_k.input[0] == add_in_qkv.output[0]: + attention_last_node = reshape_qkv + + num_heads, hidden_size = self.get_num_heads_and_hidden_size(atten_matmul, div) + + new_node = self.create_attention_node( + num_heads, + hidden_size, + add_in_qkv.output[0], + attention_last_node.output[0], + 
matmul_qk_add + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv]) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes) + self.nodes_to_remove.extend(k_nodes) + self.nodes_to_remove.extend(v_nodes[:-2]) + + # fuse head and tail transpose + if transpose_before_layernorm is not None: + node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] + for child in node_children: + for i, input in enumerate(child.input): + if child.input[i] == transpose_before_layernorm.output[0]: + child.input[i] = transpose_before_layernorm.input[0] + self.nodes_to_remove.extend([transpose_before_layernorm]) + + node = transpose_before_layernorm + while True: + found = False + node_children = input_name_to_nodes[node.output[0]] + for child in node_children: + if child is not None and child.op_type in ['SkipLayerNorm', "Add"]: + node = child + found = True + break + if not found: + break + node_children = input_name_to_nodes[node.output[0]] + if len(node_children) == 1 and node_children[0].op_type == 'Transpose': + transpose_node = node_children[0] + transpose_children = input_name_to_nodes[transpose_node.output[0]] + for i, input in enumerate(transpose_children[0].input): + if transpose_children[0].input[i] == transpose_node.output[0]: + transpose_children[0].input[i] = transpose_node.input[0] + self.nodes_to_remove.extend([transpose_node]) + # Use prune graph to remove mask nodes since they are shared by all attention nodes. + # self.nodes_to_remove.extend(mask_nodes) + # self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e16f17a8a7a9679f9dc52d2902297ee3d0e33a --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_vit_attention.py @@ -0,0 +1,354 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import math +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import Tuple, Union + +import numpy as np +import onnx +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_options import AttentionMaskFormat +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel +from .shape_infer_helper import SymbolicShapeInferenceHelper, get_shape_from_type_proto + +logger = getLogger(__name__) + + +class FusionVITAttention(Fusion): + """ + Fuse VITAttention subgraph into one Attention node. 
+    """
+
+    def __init__(
+        self,
+        model: OnnxModel,
+    ):
+        super().__init__(
+            model,
+            "CustomQKVToContextPluginDynamic_IxRT",
+            ["CustomSkipLayerNormPluginDynamic_IxRT", "LayerNormalization"],
+        )
+
+        # Flags to show warning only once
+        self.num_heads_warning = True
+        self.hidden_size_warning = True
+
+    def get_num_heads_and_hidden_size(
+        self, custom_fc: NodeProto, mul: NodeProto
+    ) -> Tuple[int, int]:
+        mul_initializer = self.model.get_initializer(mul.input[1])
+
+        # Check whether float_data is populated
+        if len(mul_initializer.float_data) > 0:
+            mul_value = mul_initializer.float_data[0]
+        else:
+            # float_data is empty, so read the value another way,
+            # e.g. when the initializer stores its data in raw_data
+            if len(mul_initializer.raw_data) > 0:
+                dtype = onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[mul_initializer.data_type]
+                mul_value = np.frombuffer(mul_initializer.raw_data, dtype=dtype)[0]
+            else:
+                raise ValueError("Data not found in the mul_initializer")
+
+        for attr in custom_fc.attribute:
+            if attr.name == "W":
+                tensor_value = attr.t
+                tensor_shape = [dim for dim in tensor_value.dims]
+                break
+        head_dim = math.floor(1.0 / (mul_value * mul_value)) * math.floor(
+            1.0 / (mul_value * mul_value)
+        )
+        hidden_size = tensor_shape[0]
+        num_heads = hidden_size // head_dim
+
+        return num_heads, hidden_size
+
+    def create_attention_node(
+        self,
+        num_heads: int,
+        hidden_size: int,
+        input: str,
+        output: str,
+        matmul_qk_add: NodeProto,
+    ) -> Union[NodeProto, None]:
+        """Create an Attention node.
+
+        Args:
+            num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
+            hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
+            input (str): input name
+            output (str): output name
+
+        Returns:
+            Union[NodeProto, None]: the node created or None if failed.
+ """ + assert num_heads > 0 + # print(hidden_size, num_heads) + if hidden_size > 0 and (hidden_size % num_heads) != 0: + logger.debug( + f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}" + ) + return None + + attention_node_name = self.model.create_node_name("Attention") + + qk_bias = None + has_mask = 0 + has_qk_bias = 0 + if matmul_qk_add is not None: + has_qk_bias = 1 + qk_bias = self.model.get_initializer(matmul_qk_add.input[1]) + qk_bias_arr = NumpyHelper.to_array(qk_bias) + if len(qk_bias_arr.shape) == 3: + qk_bias_arr = qk_bias_arr.squeeze(0) + has_neg_inf = np.isinf(qk_bias_arr) & (qk_bias_arr < 0) + if np.any(has_neg_inf): + qk_bias_arr = np.where(qk_bias_arr == -np.inf, -100, 0.0).astype( + np.float32 + ) + qk_bias.CopyFrom(numpy_helper.from_array(qk_bias_arr, qk_bias.name)) + + attention_inputs = [input] + + if qk_bias is not None: + has_mask = 1 + attention_inputs.append(qk_bias.name) + + attention_node = helper.make_node( + "CustomQKVToContextPluginDynamic_IxRT", + inputs=attention_inputs, + outputs=[output], + name=attention_node_name, + ) + attention_node.domain = "com.iluvatar" + attention_node.attribute.extend([helper.make_attribute("type_id", 2)]) + attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + attention_node.attribute.extend( + [helper.make_attribute("hidden_size", hidden_size)] + ) + attention_node.attribute.extend([helper.make_attribute("has_mask", has_mask)]) + attention_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + attention_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + attention_node.attribute.extend( + [helper.make_attribute("has_qk_bias", has_qk_bias)] + ) + + return attention_node + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm + # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + start_node = normalize_node + if normalize_node.op_type == "LayerNormalization": + add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + if add_before_layernorm is not None: + start_node = add_before_layernorm + + # SkipLayerNormalization has two inputs, and one of them is the root input for attention. 
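+        # Rough sketch of the ViT attention block this pass looks for, where a
+        # CustomFC emits packed QKV that is split by a Reshape/Transpose/Gather/Squeeze
+        # chain (illustrative only; the *_paths dicts below are the authoritative patterns):
+        #
+        #   root --> CustomFC (packed QKV) --> Reshape/Transpose/Gather/Squeeze --> {Q, K, V}
+        #   scores  = MatMul(Mul(Q), Mul(K)) --> [Add] --> Softmax
+        #   context = MatMul(scores, V) --> Transpose --> CustomFC --> Add
+        #
+        # Q and K carry the same Mul, so the combined scale is mul**2 = 1/sqrt(head_dim);
+        # get_num_heads_and_hidden_size above derives head_dim from that constant and
+        # hidden_size from the output CustomFC's "W" attribute.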
+ qkv_paths = { + "path1": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [0, 0, 0]), + "path2": (["CustomFCPluginDynamic_IxRT", "Transpose", "MatMul"], [1, 0, 0]), + } + + qkv_nodes, qkv_path = self.match_parent_path_from_dict(start_node, qkv_paths) + + if qkv_nodes is None: + logger.debug("fuse_attention: failed to match qkv path") + return + + if qkv_path in ["path1", "path2"]: + (custom_fc_after_atten, transpose_qkv, matmul_qkv) = qkv_nodes + + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] + """ + Match VIT + transpose --> LayerNormalization --> custom_fc -> attention -> Add + | | + | | + +------------------------------------------------------------------- + """ + transpose_before_layernorm = self.model.match_parent(start_node, "Transpose", 0) + if transpose_before_layernorm is not None: + node_children = input_name_to_nodes[transpose_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == "LayerNormalization": + root_input = child.output[0] + + add_before_layernorm = self.model.match_parent(start_node, "Add", None) + if add_before_layernorm is not None: + node_children = input_name_to_nodes[add_before_layernorm.output[0]] + for child in node_children: + if child is not None and child.op_type == "LayerNormalization": + root_input = child.output[0] + + # print("root_input: ", root_input, matmul_qkv.name) + v_paths = { + "path1": ( + [ + "Reshape", + "Transpose", + "Reshape", + "Gather", + "Squeeze", + "Transpose", + "Unsqueeze", + "Reshape", + "CustomFCPluginDynamic_IxRT", + ], + [1, 0, 0, 0, 0, 0, 0, 0, 0], + ) # vit + } + + v_nodes, v_path = self.match_parent_path_from_dict(matmul_qkv, v_paths) + + squeeze_input = custom_fc = None + if v_path == "path1": + (_, _, _, _, squeeze_input, _, _, _, custom_fc) = v_nodes + + if v_nodes is None: + logger.debug("fuse_attention: failed to match v path") + return + + qk_paths = { + "path1": (["Softmax", "MatMul"], [0, 0]), + "path2": (["Softmax", "Add", "MatMul"], [0, 0, None]), + } + + qk_nodes, qk_path = self.match_parent_path_from_dict(matmul_qkv, qk_paths) + # print("qk_nodes:", qk_nodes[1].name) + if qk_nodes is None: + logger.debug("fuse_attention: failed to match qk path") + return + + matmul_qk_add = None + if qk_path == "path1": + (_, matmul_qk) = qk_nodes + else: + (_, matmul_qk_add, matmul_qk) = qk_nodes + + q_paths = { + "path1": ( + ["Mul", "Reshape", "Transpose", "Reshape", "Gather", "Squeeze"], + [0, 0, 0, 0, 0, 0], + ), + } + q_nodes, q_path = self.match_parent_path_from_dict(matmul_qk, q_paths) + # print("q_nodes:", q_nodes[0].name) + squeeze_q = mul_q = None + if q_path == "path1": + squeeze_q = q_nodes[-1] + mul_q = q_nodes[0] + + if q_nodes is None: + logger.debug("fuse_attention: failed to match q path") + return + + k_paths = { + "path1": ( + [ + "Mul", + "Transpose", + "Reshape", + "Transpose", + "Reshape", + "Gather", + "Squeeze", + ], + [1, 0, 0, 0, 0, 0, 0], + ), + } + k_nodes, k_path = self.match_parent_path_from_dict(matmul_qk, k_paths) + # print("k_nodes:", k_nodes[0].name) + squeeze_k = None + if k_path == "path1": + squeeze_k = k_nodes[-1] + + if k_nodes is None: + logger.debug("fuse_attention: failed to match k path") + return + + if ( + custom_fc.input[0] == root_input + and squeeze_input == squeeze_q + and squeeze_input == squeeze_k + ): + 
attention_last_node = transpose_qkv + + num_heads, hidden_size = self.get_num_heads_and_hidden_size( + custom_fc_after_atten, mul_q + ) + + new_node = self.create_attention_node( + num_heads, + hidden_size, + custom_fc.output[0], + attention_last_node.output[0], + matmul_qk_add, + ) + if new_node is None: + return + + self.nodes_to_add.append(new_node) + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + + self.nodes_to_remove.extend([transpose_qkv, matmul_qkv]) + self.nodes_to_remove.extend(qk_nodes) + self.nodes_to_remove.extend(q_nodes[:-1]) + self.nodes_to_remove.extend(k_nodes[:-1]) + self.nodes_to_remove.extend(v_nodes[:-1]) + + # fuse head and tail transpose + if transpose_before_layernorm is not None: + node_children = input_name_to_nodes[ + transpose_before_layernorm.output[0] + ] + for child in node_children: + for i, input in enumerate(child.input): + if child.input[i] == transpose_before_layernorm.output[0]: + child.input[i] = transpose_before_layernorm.input[0] + self.nodes_to_remove.extend([transpose_before_layernorm]) + + node = transpose_before_layernorm + while True: + found = False + node_children = input_name_to_nodes[node.output[0]] + for child in node_children: + if child is not None and child.op_type in [ + "SkipLayerNorm", + "Add", + ]: + node = child + found = True + break + if not found: + break + node_children = input_name_to_nodes[node.output[0]] + if len(node_children) == 1 and node_children[0].op_type == "Transpose": + transpose_node = node_children[0] + transpose_children = input_name_to_nodes[transpose_node.output[0]] + for i, input in enumerate(transpose_children[0].input): + if transpose_children[0].input[i] == transpose_node.output[0]: + transpose_children[0].input[i] = transpose_node.input[0] + self.nodes_to_remove.extend([transpose_node]) + # Use prune graph to remove mask nodes since they are shared by all attention nodes. + # self.nodes_to_remove.extend(mask_nodes) + # self.prune_graph = True diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py new file mode 100644 index 0000000000000000000000000000000000000000..85d9cb2d8de05e0e59cb369c1d336649e4f8b429 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_xsoftmax.py @@ -0,0 +1,83 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Tuple, Union + +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionXSoftmax(Fusion): + """ + Fuse Where + Softmax + Where into one node: XSoftmax + """ + + def __init__(self, model: OnnxModel): + super().__init__(model, "XSoftmax_IxRT", "MatMul") + + def create_xsoftmax_node( + self, data_input: str, mask_input: str, output: str + ) -> Union[NodeProto, None]: + """Create an XSoftmax node. + + Args: + data_input (str): data input name + mask_input (str): max input name + output (str): output name + + Returns: + Union[NodeProto, None]: the node created or None if failed. 
+ """ + xsoftmax_node_name = self.model.create_node_name("XSoftmax") + + xsoftmax_node = helper.make_node( + "XSoftmax_IxRT", + inputs=[data_input, mask_input], + outputs=[output], + name=xsoftmax_node_name, + ) + xsoftmax_node.domain = "com.iluvatar" + xsoftmax_node.attribute.extend([helper.make_attribute("plugin_namespace", "")]) + xsoftmax_node.attribute.extend([helper.make_attribute("plugin_version", "1")]) + xsoftmax_node.attribute.extend([helper.make_attribute("type_id", 2)]) + xsoftmax_node.attribute.extend([helper.make_attribute("dim", -1)]) + + return xsoftmax_node + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + + xsoftmax_paths = { + "path": (["Where", "Softmax", "Where", "Add"], [None, None, None, None]), + } + xsoftmax_nodes, xsoftmax_path = self.match_parent_path_from_dict( + node, xsoftmax_paths + ) + + if xsoftmax_nodes is None: + logger.debug("fuse_xsoftmax: failed to match xsoftmax path") + return + else: + (tail_where, softmax, head_where, add) = xsoftmax_nodes + where_inputs = [i for i in tail_where.input if i in head_where.input] + assert len(where_inputs) == 1 + mask_input = where_inputs[0] + data_input = add.output[0] + data_output = tail_where.output[0] + + xsoftmax_node = self.create_xsoftmax_node( + data_input, mask_input, data_output + ) + + self.nodes_to_add.append(xsoftmax_node) + self.node_name_to_graph_name[xsoftmax_node.name] = self.this_graph_name + self.nodes_to_remove.append(tail_where) + self.nodes_to_remove.append(softmax) + self.nodes_to_remove.append(head_where) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..ba66693c965db49dc4287911fc00e2373a20efbc --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/fusion_yolov5_decoder.py @@ -0,0 +1,131 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from enum import Enum +from logging import getLogger +from os import name +from sys import path +from typing import List, Tuple, Union + +import numpy as np +from onnx import NodeProto, TensorProto, helper, numpy_helper + +from .fusion_base import Fusion +from .fusion_utils import FusionUtils, NumpyHelper +from .onnx_model import OnnxModel + +logger = getLogger(__name__) + + +def get_tensor_attr(attrs, attr_name): + result = None + for i in attrs: + if i.name == attr_name: + return numpy_helper.to_array(i.t) + return result + + +class FusionYoloV5Decoder(Fusion): + """ + Fuse SwinL subgraph into one Attention node. 
+ """ + + def __init__( + self, + model: OnnxModel, + ): + super().__init__(model, "YoloV5Decoder", ["Reshape"]) + + # Flags to show warning only once + self.num_heads_warning = True + self.hidden_size_warning = True + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + short_path = ["Concat", "Slice", "Sigmoid", "Transpose", "Reshape"] + paths = [ + (["Concat", "Unsqueeze", "Gather", "Shape"], [1] + [None] * 3), + ( + ["Concat", "Mul", "Add", "Sub", "Mul", "Slice", "Sigmoid", "Transpose"], + [0, 0] + [None] * 6, + ), + ( + ["Concat", "Mul", "Pow", "Mul", "Slice", "Sigmoid", "Transpose"], + [0, 1] + [None] * 5, + ), + (short_path, [None] * 5), + (short_path + ["Concat", "Unsqueeze", "Gather", "Shape"], [None] * 9), + ] + paths_found = [] + nodes_names_found = set() + nodes_found = [] + for path_i in paths: + nodes = self.model.match_parent_path(normalize_node, path_i[0], path_i[1]) + paths_found.append(nodes) + if nodes: + for n in nodes: + if n.name not in nodes_names_found: + nodes_names_found.add(n.name) + nodes_found.append(n) + if not all(paths_found): + return + shape_node = paths_found[-1][-1] + params = self._find_yolov5_decoder_params(paths_found) + self._fuse_node( + inputs=shape_node.input, outputs=normalize_node.output, params=params + ) + self.nodes_to_remove.extend(nodes_found) + self._delete_extra_output_edges(paths_found) + self.prune_graph = True + + def _fuse_node(self, inputs, outputs, params): + fused_node = helper.make_node( + "YoloV5Decoder", + inputs=inputs, + outputs=outputs, + name=self.model.create_node_name("YoloV5Decoder"), + ) + fused_node.attribute.extend(params) + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[fused_node.name] = self.this_graph_name + + def _delete_extra_output_edges(self, paths_found): + transpose_node = paths_found[2][-1] + assert transpose_node.op_type == "Transpose" + out_edge = transpose_node.output[0] + for item in self.model.graph().output: + if item.name == out_edge: + self.model.graph().output.remove(item) + logger.warning(f"Output: {out_edge} is useless in graph, delete it") + return + + def _find_yolov5_decoder_params(self, paths_found): + # num_class + concat_op = paths_found[0][0] + assert concat_op.op_type == "Concat" + num_class_arr = self.model.get_initializer(concat_op.input[2], True) + assert num_class_arr + num_class = (num_class_arr - 5).tolist()[0] + num_class = helper.make_attribute("num_class", num_class) + + # stride + mul_op = paths_found[1][1] + assert mul_op.op_type == "Mul" + input_arrs = self.model.get_initializer_input_edges(mul_op.name, True) + assert len(input_arrs) == 1 + stride = input_arrs[0].tolist() + stride = helper.make_attribute("stride", stride) + + # anchor + mul_op = paths_found[2][1] + assert mul_op.op_type == "Mul" + anchor = self.model.get_initializer_input_edges(mul_op.name, True) + assert len(anchor) == 1 + anchor = anchor[0] + anchor = anchor[0, :, 0, 0, :] if len(anchor.shape) == 5 else anchor[:, 0, 0, :] + anchor = helper.make_attribute("anchor", list(anchor.flatten())) + + # fast_impl + fast_impl = helper.make_attribute("faster_impl", 1) + + return [num_class, stride, anchor, fast_impl] diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b176058c9fdc7a5b3dbbc9ef8294d910f689cc31 --- /dev/null +++ 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/onnx_model.py @@ -0,0 +1,1166 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import logging +import os +import sys +from collections import deque +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +from onnx import ( + AttributeProto, + GraphProto, + ModelProto, + NodeProto, + TensorProto, + helper, + numpy_helper, + save_model, +) + +from .float16 import convert_float_to_float16 +from .shape_infer_helper import SymbolicShapeInferenceHelper + +logger = logging.getLogger(__name__) + + +class OnnxModel: + def __init__(self, model): + self.initialize(model) + self.initializer_visited: Dict[str, bool] = {} + + def initialize(self, model): + self.model: ModelProto = model + self._node_name_suffix: Dict[ + str, int + ] = {} # key is node name prefix, value is the last suffix generated + self.shape_infer_helper: SymbolicShapeInferenceHelper = None + self.enable_shape_infer: bool = True + self.all_graphs: Optional[List[GraphProto]] = None + + def disable_shape_inference(self): + self.enable_shape_infer = False + + def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): + if self.enable_shape_infer: + if self.shape_infer_helper is None or update: + self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model) + + try: + if self.shape_infer_helper.infer(dynamic_axis_mapping): + return self.shape_infer_helper + except: + self.enable_shape_infer = ( + False # disable shape inference to suppress same error message. + ) + print("failed in shape inference", sys.exc_info()[0]) + + return None + + def input_name_to_nodes(self): + input_name_to_nodes = {} + for node in self.nodes(): + for input_name in node.input: + if input_name not in input_name_to_nodes: + input_name_to_nodes[input_name] = [node] + else: + input_name_to_nodes[input_name].append(node) + return input_name_to_nodes + + def output_name_to_node(self): + output_name_to_node = {} + for node in self.nodes(): + for output_name in node.output: + output_name_to_node[output_name] = node + return output_name_to_node + + def nodes(self): + all_nodes = [] + for graph in self.graphs(): + for node in graph.node: + all_nodes.append(node) + return all_nodes + + def graph(self): + return self.model.graph + + def graphs(self): + if self.all_graphs is not None: + return self.all_graphs + self.all_graphs = [] + graph_queue = [self.model.graph] + while graph_queue: + graph = graph_queue.pop(0) + self.all_graphs.append(graph) + for node in graph.node: + for attr in node.attribute: + if attr.type == AttributeProto.AttributeType.GRAPH: + assert isinstance(attr.g, GraphProto) + graph_queue.append(attr.g) + if attr.type == AttributeProto.AttributeType.GRAPHS: + for g in attr.graphs: + assert isinstance(g, GraphProto) + graph_queue.append(g) + return self.all_graphs + + def get_graphs_input_names(self): + input_names = [] + for graph in self.graphs(): + for input in graph.input: + input_names.append(input.name) + return input_names + + def get_graphs_output_names(self): + output_names = [] + for graph in self.graphs(): + for output in graph.output: + output_names.append(output.name) + return output_names + + def get_graph_by_node(self, node): + for graph in self.graphs(): + if node in graph.node: + return graph + return None + + def 
get_graph_by_name(self, graph_name): + for graph in self.graphs(): + if graph_name == graph.name: + return graph + return None + + def get_topological_insert_id(self, graph, outputs): + for idx, node in enumerate(graph.node): + for input in node.input: + if input in outputs: + return idx + return len(graph.node) + + def remove_node(self, node): + for graph in self.graphs(): + if node in graph.node: + graph.node.remove(node) + + def remove_nodes(self, nodes_to_remove): + for node in nodes_to_remove: + self.remove_node(node) + + def add_node(self, node, graph_name=None): + if graph_name is None or graph_name == self.model.graph.name: + self.model.graph.node.extend([node]) + else: + graph = self.get_graph_by_name(graph_name) + insert_idx = self.get_topological_insert_id(graph, node.output) + graph.node.insert(insert_idx, node) + + def add_nodes(self, nodes_to_add, node_name_to_graph_name=None): + if node_name_to_graph_name is None: + self.model.graph.node.extend(nodes_to_add) + else: + for node in nodes_to_add: + graph_name = node_name_to_graph_name[node.name] + self.add_node(node, graph_name) + + def add_initializer(self, tensor, graph_name=None): + if graph_name is None or graph_name == self.model.graph.name: + self.model.graph.initializer.extend([tensor]) + else: + graph = self.get_graph_by_name(graph_name) + graph.initializer.extend([tensor]) + + def add_input(self, input, graph_name=None): + if graph_name is None or graph_name == self.model.graph.name: + self.model.graph.input.extend([input]) + else: + graph = self.get_graph_by_name(graph_name) + graph.input.extend([input]) + + @staticmethod + def replace_node_input(node, old_input_name, new_input_name): + assert isinstance(old_input_name, str) and isinstance(new_input_name, str) + for j in range(len(node.input)): + if node.input[j] == old_input_name: + node.input[j] = new_input_name + + def replace_input_of_all_nodes(self, old_input_name, new_input_name): + for node in self.model.graph.node: + OnnxModel.replace_node_input(node, old_input_name, new_input_name) + + @staticmethod + def replace_node_output(node, old_output_name, new_output_name): + assert isinstance(old_output_name, str) and isinstance(new_output_name, str) + for j in range(len(node.output)): + if node.output[j] == old_output_name: + node.output[j] = new_output_name + + def replace_output_of_all_nodes(self, old_output_name, new_output_name): + for node in self.model.graph.node: + OnnxModel.replace_node_output(node, old_output_name, new_output_name) + + def get_initializer(self, name, return_np_array=False): + for graph in self.graphs(): + for tensor in graph.initializer: + if tensor.name == name: + return numpy_helper.to_array(tensor) if return_np_array else tensor + return None + + def get_node(self, op_name): + for graph in self.graphs(): + for n in graph.node: + if n.name == op_name: + return n + return None + + def get_initializer_input_edges(self, op_name, return_np_array=False): + initializers = {i.name: i for graph in self.graphs() for i in graph.initializer} + node = self.get_node(op_name) + assert node + result = [] + for i in node.input: + if i in initializers: + tensor = initializers[i] + tensor = numpy_helper.to_array(tensor) if return_np_array else tensor + result.append(tensor) + return result + + def get_nodes_by_op_type(self, op_type): + nodes = [] + for node in self.nodes(): + if node.op_type == op_type: + nodes.append(node) + return nodes + + def get_children(self, node, input_name_to_nodes=None): + if input_name_to_nodes is None: + input_name_to_nodes = 
self.input_name_to_nodes() + + children = [] + for output in node.output: + if output in input_name_to_nodes: + for node in input_name_to_nodes[output]: + children.append(node) + return children + + def get_parents(self, node, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + parents = [] + for input in node.input: + if input in output_name_to_node: + parents.append(output_name_to_node[input]) + return parents + + def get_parent(self, node, i, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + if len(node.input) <= i: + return None + + input = node.input[i] + if input not in output_name_to_node: + return None + + return output_name_to_node[input] + + def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): + """ + Find parent node based on constraints on op_type. + + Args: + node (str): current node name. + parent_op_type (str): constraint of parent node op_type. + output_name_to_node (dict): dictionary with output name as key, and node as value. + exclude (list): list of nodes that are excluded (not allowed to match as parent). + + Returns: + parent: The matched parent node. None if not found. + index: The input index of matched parent node. None if not found. + """ + for i, input in enumerate(node.input): + if input in output_name_to_node: + parent = output_name_to_node[input] + if parent.op_type == parent_op_type and parent not in exclude: + return parent, i + else: + logger.debug( + f"To find first {parent_op_type}, current {parent.op_type}" + ) + return None, None + + def match_parent( + self, + node, + parent_op_type, + input_index=None, + output_name_to_node=None, + exclude=[], + return_indice=None, + ): + """ + Find parent node based on constraints on op_type and index. + When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. + + Args: + node (str): current node name. + parent_op_type (str): constraint of parent node op_type. + input_index (int or None): only check the parent given input index of current node. + output_name_to_node (dict): dictionary with output name as key, and node as value. + exclude (list): list of nodes that are excluded (not allowed to match as parent). + return_indice (list): a list to append the input index when input_index is None. + + Returns: + parent: The matched parent node. 
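+
+        Example (illustrative): match_parent(add_node, "MatMul", 0) returns the node
+        producing add_node.input[0] if that node is a MatMul and not excluded, otherwise None.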
+ """ + assert node is not None + assert input_index is None or input_index >= 0 + + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + if input_index is None: + parent, index = self.match_first_parent( + node, parent_op_type, output_name_to_node, exclude + ) + if return_indice is not None: + return_indice.append(index) + return parent + + if input_index >= len(node.input): + logger.debug(f"input_index {input_index} >= node inputs {len(node.input)}") + return None + + parent = self.get_parent(node, input_index, output_name_to_node) + if ( + parent is not None + and parent.op_type == parent_op_type + and parent not in exclude + ): + return parent + + if parent is not None: + logger.debug(f"Expect {parent_op_type}, Got {parent.op_type}") + + return None + + def match_parent_paths(self, node, paths, output_name_to_node): + for i, path in enumerate(paths): + assert isinstance(path, List) or isinstance(path, Tuple) + return_indice = [] + matched = self.match_parent_path( + node, path[0], path[1], output_name_to_node, return_indice + ) + if matched: + return i, matched, return_indice + return -1, None, None + + def match_parent_path( + self, + node, + parent_op_types, + parent_input_index, + output_name_to_node=None, + return_indice=None, + ): + """ + Find a sequence of input edges based on constraints on parent op_type and index. + When input_index is None, we will find the first parent node based on constraints, and return_indice will be appended the corresponding input index. + + Args: + node (str): current node name. + parent_op_types (str): constraint of parent node op_type of each input edge. + parent_input_index (list): constraint of input index of each input edge. None means no constraint. + output_name_to_node (dict): dictionary with output name as key, and node as value. + return_indice (list): a list to append the input index when there is no constraint on input index of an edge. + + Returns: + parents: a list of matched parent node. 
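+
+        Example (illustrative): match_parent_path(matmul_qk, ["Softmax", "Add"], [0, None])
+        follows matmul_qk.input[0] to a Softmax, then any input of that Softmax to an Add,
+        and returns [softmax_node, add_node], or None if the chain does not match.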
+ """ + assert len(parent_input_index) == len(parent_op_types) + + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + current_node = node + matched_parents = [] + for i, op_type in enumerate(parent_op_types): + matched_parent = self.match_parent( + current_node, + op_type, + parent_input_index[i], + output_name_to_node, + exclude=[], + return_indice=return_indice, + ) + if matched_parent is None: + logger.debug( + f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}", + stack_info=True, + ) + return None + + matched_parents.append(matched_parent) + current_node = matched_parent + + return matched_parents + + def find_first_child_by_type( + self, node, child_type, input_name_to_nodes=None, recursive=True + ): + children = self.get_children(node, input_name_to_nodes) + dq = deque(children) + while len(dq) > 0: + current_node = dq.pop() + if current_node.op_type == child_type: + return current_node + + if recursive: + children = self.get_children(current_node, input_name_to_nodes) + for child in children: + dq.appendleft(child) + + return None + + def find_first_parent_by_type( + self, node, parent_type, output_name_to_node=None, recursive=True + ): + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + parents = self.get_parents(node, output_name_to_node) + dq = deque(parents) + while len(dq) > 0: + current_node = dq.pop() + if current_node.op_type == parent_type: + return current_node + + if recursive: + parents = self.get_parents(current_node, output_name_to_node) + for parent in parents: + dq.appendleft(parent) + + return None + + def get_constant_value(self, output_name): + for node in self.get_nodes_by_op_type("Constant"): + if node.output[0] == output_name: + for att in node.attribute: + if att.name == "value": + return numpy_helper.to_array(att.t) + + # Fall back to intializer since constant folding might have been applied. + initializer = self.get_initializer(output_name) + if initializer is not None: + return numpy_helper.to_array(initializer) + + return None + + def get_constant_input(self, node): + for i, input in enumerate(node.input): + value = self.get_constant_value(input) + if value is not None: + return i, value + + return None, None + + def find_constant_input(self, node, expected_value, delta=0.000001): + i, value = self.get_constant_input(node) + if ( + value is not None + and value.size == 1 + and abs(value - expected_value) < delta + ): + return i + + return -1 + + def is_constant_with_specified_dimension( + self, output_name, dimensions, description + ): + value = self.get_constant_value(output_name) + if value is None: + logger.debug(f"{description} {output_name} is not initializer.") + return False + + if len(value.shape) != dimensions: + logger.debug( + f"{description} {output_name} shall have {dimensions} dimensions. 
Got shape {value.shape}" + ) + return False + + return True + + def has_constant_input(self, node, expected_value, delta=0.000001): + return self.find_constant_input(node, expected_value, delta) >= 0 + + def get_children_subgraph_nodes( + self, root_node, stop_nodes, input_name_to_nodes=None + ): + if input_name_to_nodes is None: + input_name_to_nodes = self.input_name_to_nodes() + + children = input_name_to_nodes[root_node.output[0]] + + unique_nodes = [] + + dq = deque(children) + while len(dq) > 0: + current_node = dq.pop() + if current_node in stop_nodes: + continue + + if current_node not in unique_nodes: + unique_nodes.append(current_node) + + for output in current_node.output: + if output in input_name_to_nodes: + children = input_name_to_nodes[output] + for child in children: + dq.appendleft(child) + + return unique_nodes + + def tensor_shape_to_list(self, tensor_type): + """Convert tensor shape to list""" + shape_list = [] + for d in tensor_type.shape.dim: + if d.HasField("dim_value"): + shape_list.append(d.dim_value) # known dimension + elif d.HasField("dim_param"): + shape_list.append(d.dim_param) # unknown dimension with symbolic name + else: + shape_list.append("?") # shall not happen + return shape_list + + def get_dtype(self, input_or_output: str): + """Try get data type given a name (could be initializer, graph input or output).""" + tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info} + + if input_or_output in tensor_type_map: + return tensor_type_map[input_or_output].tensor_type.elem_type + + graph_input = self.find_graph_input(input_or_output) + if graph_input: + return graph_input.type.tensor_type.elem_type + + graph_output = self.find_graph_output(input_or_output) + if graph_output: + return graph_output.type.tensor_type.elem_type + + return None + + @staticmethod + def get_node_attribute(node: NodeProto, attribute_name: str): + for attr in node.attribute: + if attr.name == attribute_name: + value = helper.get_attribute_value(attr) + return value + return None + + def remove_cascaded_cast_nodes(self): + """Remove Cast node that are followed by another Cast node like --> Cast --> Cast --> + Note that this shall be used carefully since it might introduce semantic change. + For example, float -> int -> float could get different value than the original float value. + So, it is recommended to used only in post-processing of mixed precision conversion. + """ + output_name_to_node = self.output_name_to_node() + removed_count = 0 + for node in self.nodes(): + if node.op_type == "Cast": + parent = self.get_parent( + node, 0, output_name_to_node=output_name_to_node + ) + if parent and parent.op_type == "Cast": + node.input[0] = parent.input[0] + removed_count += 1 + + if removed_count > 0: + logger.info("Removed %d cascaded Cast nodes", removed_count) + self.prune_graph() + + def remove_useless_cast_nodes(self): + """Remove cast nodes that are not needed: input and output has same data type.""" + shape_infer = self.infer_runtime_shape(update=True) + if shape_infer is None: + logger.info( + f"Skip removing useless cast nodes since shape inference failed." 
+ ) + return + + def get_data_type(input_or_output_name): + dtype = self.get_dtype(input_or_output_name) + if dtype: + return dtype + if shape_infer.known_vi_[input_or_output_name].type.tensor_type.HasField( + "elem_type" + ): + return shape_infer.known_vi_[ + input_or_output_name + ].type.tensor_type.elem_type + return None + + nodes_to_remove = [] + for node in self.nodes(): + if node.op_type == "Cast": + input_dtype = get_data_type(node.input[0]) + output_dtype = get_data_type(node.output[0]) + if input_dtype and input_dtype == output_dtype: + nodes_to_remove.append(node) + + if nodes_to_remove: + graph_input_names = set(self.get_graphs_input_names()) + graph_output_names = set(self.get_graphs_output_names()) + for node in nodes_to_remove: + if bool(set(node.output) & graph_output_names): + if not bool(set(node.input) & graph_input_names): + self.replace_output_of_all_nodes(node.input[0], node.output[0]) + else: + continue + else: + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + self.remove_node(node) + + logger.info( + "Removed %d Cast nodes with output type same as input", + len(nodes_to_remove), + ) + + def convert_model_float32_to_float16(self, cast_input_output=True): + logger.warning( + "The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!" + ) + self.convert_float_to_float16( + use_symbolic_shape_infer=True, keep_io_types=cast_input_output + ) + + def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs): + """Convert a model to half (default) or mixed precision. + To use mixed precision, user need specify which graph inputs, outputs, operator type or list of nodes shall keep in float32. + By default, we use symbolic shape inference to get shape and type information. If not, ONNX shape inference will be used. + Note that symbolic/ONNX shape inference might fail, and the conversion might not proceed without shape and type information. + + Args: + use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. Defaults to True. + keep_io_types (Union[bool, List[str]], optional): It could be boolean or a list of float32 input/output names. + If True, model inputs/outputs should be left as float32. Defaults to False. + op_block_list (List[str], optional): List of operator types to leave as float32. + Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST` as default. + node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. + force_fp16_initializers(bool): force converting all float initializers to float16. + Default to false, which will convert only the one needed to avoid precision loss. + min_positive_val (float, optional): minimal positive value. Defaults to 1e-7. + max_finite_val (float, optional): maximal finite value. Defaults to 1e4. + """ + if "keep_io_types" not in kwargs: + kwargs["keep_io_types"] = True + + model = self.model + if use_symbolic_shape_infer: + # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference. 
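+            # Contrib ops such as Gelu and SkipLayerNormalization are resolved through the
+            # dispatcher table in SymbolicShapeInference, so shape/type information remains
+            # available when deciding which tensors can safely become float16.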
+ shape_infer_helper = SymbolicShapeInferenceHelper(model) + model = shape_infer_helper.infer_shapes( + model, auto_merge=True, guess_output_rank=False + ) + + parameters = {"disable_shape_infer": use_symbolic_shape_infer} + parameters.update( + { + key: kwargs[key] + for key in [ + "keep_io_types", + "min_positive_val", + "max_finite_val", + "op_block_list", + "node_block_list", + "force_fp16_initializers", + ] + if key in kwargs + } + ) + + fp16_model = convert_float_to_float16(model, **parameters) + self.initialize(fp16_model) + + self.remove_cascaded_cast_nodes() + + self.remove_useless_cast_nodes() + + def create_node_name(self, op_type, name_prefix=None): + """Create a unique node name that starts with a prefix (default is operator type). + The name will not be duplicated with any name that generated or existed in current graphs. + Args: + op_type (str): operator type + name_prefix (str, optional): prefix of node name. Defaults to None. + + Returns: + str: node name + """ + + if name_prefix: + prefix = name_prefix if name_prefix.endswith("_") else (name_prefix + "_") + else: + prefix = op_type + "_" + + suffix: int = 0 + if prefix in self._node_name_suffix: + suffix = self._node_name_suffix[prefix] + 1 + else: + # Check existed node name only once for a prefix as we assume create_node_name is called for every new node in fusion. + for node in self.nodes(): + if node.name and node.name.startswith(prefix): + try: + index = int(node.name[len(prefix) :]) + suffix = max(index + 1, suffix) + except ValueError: + continue + + # Record the generated suffix so that we can avoid generating duplicated name. + self._node_name_suffix[prefix] = suffix + + return prefix + str(suffix) + + def find_graph_input(self, input_name): + for input in self.model.graph.input: + if input.name == input_name: + return input + return None + + def find_graph_output(self, output_name): + for output in self.model.graph.output: + if output.name == output_name: + return output + return None + + def get_parent_subgraph_nodes(self, node, stop_nodes, output_name_to_node=None): + if output_name_to_node is None: + output_name_to_node = self.output_name_to_node() + + unique_nodes = [] + + parents = self.get_parents(node, output_name_to_node) + dq = deque(parents) + while len(dq) > 0: + current_node = dq.pop() + if current_node in stop_nodes: + continue + + if current_node not in unique_nodes: + unique_nodes.append(current_node) + + for input in current_node.input: + if input in output_name_to_node: + dq.appendleft(output_name_to_node[input]) + + return unique_nodes + + def get_graph_inputs(self, current_node, recursive=False): + """ + Find graph inputs that linked to current node. 
+ """ + graph_inputs = [] + for input in current_node.input: + if self.find_graph_input(input) and input not in graph_inputs: + graph_inputs.append(input) + + if recursive: + parent_nodes = self.get_parent_subgraph_nodes(current_node, []) + for node in parent_nodes: + for input in node.input: + if self.find_graph_input(input) and input not in graph_inputs: + graph_inputs.append(input) + return graph_inputs + + @staticmethod + def input_index(node_output, child_node): + index = 0 + for input in child_node.input: + if input == node_output: + return index + index += 1 + return -1 + + def remove_unused_constant(self): + input_name_to_nodes = self.input_name_to_nodes() + + # remove unused constant + unused_nodes = [] + nodes = self.nodes() + for node in nodes: + if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes: + unused_nodes.append(node) + + self.remove_nodes(unused_nodes) + + if len(unused_nodes) > 0: + logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}") + + def prune_graph(self, outputs=None): + """ + Prune graph to keep only required outputs. It removes unnecessary inputs and nodes. + Nodes are not linked (directly or indirectly) to any required output will be removed. + + Args: + outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept. + """ + if len(self.graphs()) > 1: + logger.debug(f"Skip prune_graph since graph has subgraph") + return + + if outputs is None: + outputs = [output.name for output in self.model.graph.output] + + output_name_to_node = self.output_name_to_node() + all_nodes = [] + for output in outputs: + if output in output_name_to_node: + last_node = output_name_to_node[output] + if last_node in all_nodes: + continue + nodes = self.get_parent_subgraph_nodes(last_node, []) + all_nodes.append(last_node) + all_nodes.extend(nodes) + + nodes_to_remove = [] + for node in self.model.graph.node: + if node not in all_nodes: + nodes_to_remove.append(node) + + self.remove_nodes(nodes_to_remove) + + # remove outputs not in list + output_to_remove = [] + for output in self.model.graph.output: + if output.name not in outputs: + output_to_remove.append(output) + for output in output_to_remove: + self.model.graph.output.remove(output) + + # remove inputs not used by any node. 
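+        # Graph inputs that no longer feed any remaining node are dropped below;
+        # orphaned initializers are cleaned up afterwards by update_graph().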
+ input_name_to_nodes = self.input_name_to_nodes() + input_to_remove = [] + for input in self.model.graph.input: + if input.name not in input_name_to_nodes: + input_to_remove.append(input) + for input in input_to_remove: + self.model.graph.input.remove(input) + + if input_to_remove or output_to_remove or nodes_to_remove: + logger.info( + "Graph pruned: {} inputs, {} outputs and {} nodes are removed".format( + len(input_to_remove), len(output_to_remove), len(nodes_to_remove) + ) + ) + + self.update_graph() + + def update_graph(self, verbose=False): + graph = self.model.graph + + remaining_input_names = [] + for node in graph.node: + if node.op_type in ["Loop", "Scan", "If"]: + # TODO: handle inner graph + logger.debug( + f"Skip update_graph since graph has operator: {node.op_type}" + ) + return + if node.op_type != "Constant": + for input_name in node.input: + if input_name not in remaining_input_names: + remaining_input_names.append(input_name) + if verbose: + logger.debug(f"remaining input names: {remaining_input_names}") + + # remove graph input that is not used + inputs_to_remove = [] + for input in graph.input: + if input.name not in remaining_input_names: + inputs_to_remove.append(input) + for input in inputs_to_remove: + graph.input.remove(input) + + names_to_remove = [input.name for input in inputs_to_remove] + logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}") + + # remove weights that are not used + weights_to_remove = [] + weights_to_keep = [] + for initializer in graph.initializer: + if ( + initializer.name not in remaining_input_names + and not self.find_graph_output(initializer.name) + ): + weights_to_remove.append(initializer) + else: + weights_to_keep.append(initializer.name) + for initializer in weights_to_remove: + graph.initializer.remove(initializer) + + names_to_remove = [initializer.name for initializer in weights_to_remove] + logger.debug( + f"remove {len(weights_to_remove)} unused initializers: {names_to_remove}" + ) + if verbose: + logger.debug(f"remaining initializers:{weights_to_keep}") + + self.remove_unused_constant() + + def is_safe_to_fuse_nodes( + self, nodes_to_remove, keep_outputs, input_name_to_nodes, output_name_to_node + ): + for node_to_remove in nodes_to_remove: + for output_to_remove in node_to_remove.output: + if output_to_remove in keep_outputs: + continue + + if output_to_remove in input_name_to_nodes: + for impacted_node in input_name_to_nodes[output_to_remove]: + if impacted_node not in nodes_to_remove: + logger.debug( + f"it is not safe to remove nodes since output {output_to_remove} is used by {impacted_node}" + ) + return False + return True + + @staticmethod + def graph_topological_sort(graph): + deps_count = [0] * len(graph.node) # dependency count of each node + deps_to_nodes = {} # input to node indice + sorted_nodes = [] # initialize sorted_nodes + for node_idx, node in enumerate(graph.node): + # CANNOT use len(node.input) directly because input can be optional + deps_count[node_idx] = sum(1 for _ in node.input if _) + if deps_count[node_idx] == 0: # Constant doesn't depend on any inputs + sorted_nodes.append(graph.node[node_idx]) + continue + + for input_name in node.input: + if input_name not in deps_to_nodes: + deps_to_nodes[input_name] = [node_idx] + else: + deps_to_nodes[input_name].append(node_idx) + + # Note: this logic only applies to top level graph since a sub graph could use intializer from parent graph + initializer_names = [init.name for init in graph.initializer] + graph_input_names = 
[input.name for input in graph.input] + input_names = initializer_names + graph_input_names + input_names.sort() + prev_input_name = None + for input_name in input_names: + if prev_input_name == input_name: + continue + + prev_input_name = input_name + if input_name in deps_to_nodes: + for node_idx in deps_to_nodes[input_name]: + deps_count[node_idx] = deps_count[node_idx] - 1 + if deps_count[node_idx] == 0: + sorted_nodes.append(graph.node[node_idx]) + + start = 0 + end = len(sorted_nodes) + + while start < end: + for output in sorted_nodes[start].output: + if output in deps_to_nodes: + for node_idx in deps_to_nodes[output]: + deps_count[node_idx] = deps_count[node_idx] - 1 + if deps_count[node_idx] == 0: + sorted_nodes.append(graph.node[node_idx]) + end = end + 1 + start = start + 1 + + if end != len(graph.node): + raise RuntimeError( + f"Graph is not a DAG: end={end}, len(graph.node)={len(graph.node)}, graph.node[end]={graph.node[end]}" + ) + + graph.ClearField("node") + graph.node.extend(sorted_nodes) + + def topological_sort(self): + # TODO: support graph_topological_sort() in subgraphs + # for graph in self.graphs(): + # self.graph_topological_sort(graph) + OnnxModel.graph_topological_sort(self.model.graph) + + @staticmethod + def save( + model, + output_path, + save_as_external_data=False, + all_tensors_to_one_file=True, + size_threshold=1024, + convert_attribute=False, + ): + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + if save_as_external_data: + # Save model to external data, which is needed for model size > 2GB + output_dir = Path(output_path).parent + output_dir.mkdir(parents=True, exist_ok=True) + external_data_path = output_path + ".data" + location = ( + Path(external_data_path).name if all_tensors_to_one_file else None + ) + + if os.path.exists(output_path): + logger.info(f"Delete the existed onnx file: {output_path}") + os.remove(output_path) + + if all_tensors_to_one_file: + if os.path.exists(external_data_path): + # Delete the external data file. Otherwise, data will be appended to existing file. + logger.info( + f"Delete the existed external data file: {external_data_path}" + ) + os.remove(external_data_path) + else: + if os.listdir(output_dir): + raise RuntimeError( + f"Output directory ({output_dir}) for external data is not empty." + ) + + save_model( + model, + output_path, + save_as_external_data=True, + all_tensors_to_one_file=all_tensors_to_one_file, + location=location, + size_threshold=size_threshold, + convert_attribute=convert_attribute, + ) + else: + save_model(model, output_path) + + def save_model_to_file( + self, output_path, use_external_data_format=False, all_tensors_to_one_file=True + ): + logger.info(f"Sort graphs in topological order") + self.topological_sort() + + if output_path.endswith(".json"): # Output text for testing small model. + with open(output_path, "w") as out: + out.write(str(model)) + else: + OnnxModel.save( + self.model, + output_path, + use_external_data_format, + all_tensors_to_one_file, + ) + logger.info(f"Model saved to {output_path}") + + def get_graph_inputs_excluding_initializers(self): + """ + Returns real graph inputs (excluding initializers from older onnx model). + """ + graph_inputs = [] + for input in self.model.graph.input: + if self.get_initializer(input.name) is None: + graph_inputs.append(input) + return graph_inputs + + def get_opset_version(self): + """Get opset version of onnx domain + + Raises: + RuntimeError: ONNX model has no opset for default domain. 
+ + Returns: + int: opset version of onnx domain. + """ + for opset in self.model.opset_import: + if opset.domain in ["", "ai.onnx"]: + return opset.version + raise RuntimeError("ONNX model has no opset for default domain") + + @staticmethod + def has_same_value(tensor1: TensorProto, tensor2: TensorProto) -> bool: + """Returns True when two tensors have same value. + Note that name can be different. + + Args: + tensor1 (TensorProto): initializer 1 + tensor2 (TensorProto): initializer 2 + + Returns: + bool: True when two intializers has same value. + """ + if tensor1.data_type != tensor2.data_type or tensor1.dims != tensor2.dims: + return False + if tensor1.HasField("raw_data") and tensor2.HasField("raw_data"): + return tensor1.raw_data == tensor2.raw_data + return numpy_helper.to_array(tensor1) == numpy_helper.to_array(tensor2) + + def remove_duplicated_initializer(self): + """Remove initializers with duplicated values, and only keep the first one. + It could help reduce size of models (like ALBert) with shared weights. + Note: this function does not process subgraph. + """ + if len(self.graphs()) > 1: + logger.warning("remove_duplicated_initializer does not process subgraphs.") + + initializer_count = len(self.model.graph.initializer) + + same = [-1] * initializer_count + for i in range(initializer_count - 1): + if same[i] >= 0: + continue + for j in range(i + 1, initializer_count): + if OnnxModel.has_same_value( + self.model.graph.initializer[i], self.model.graph.initializer[j] + ): + same[j] = i + + count = 0 + for i in range(initializer_count): + if same[i] >= 0: + count += 1 + self.replace_input_of_all_nodes( + self.model.graph.initializer[i].name, + self.model.graph.initializer[same[i]].name, + ) + + if count > 0: + self.update_graph() + print(f"Removed {count} initializers with duplicated value") + + def add_prefix_to_names(self, prefix: str): + """Add prefix to initializer or intermediate outputs in graph. Main graph inputs and outputs are excluded. + It could help avoid conflicting in name of node_args when merging two graphs. + Note: this function does not process subgraph. 
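+
+        Example (illustrative): add_prefix_to_names("encoder_") renames an initializer
+        "weight" to "encoder_weight" and rewrites matching node inputs/outputs, while the
+        main graph inputs and outputs keep their original names.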
+ """ + if len(self.graphs()) > 1: + logger.warning("add_prefix_to_names does not process subgraphs.") + + # Exclude the names of inputs and outputs of main graph (but not subgraphs) + excluded = [i.name for i in self.model.graph.input] + [ + o.name for o in self.model.graph.output + ] + + for initializer in self.model.graph.initializer: + if initializer.name not in excluded: + if prefix + initializer.name not in excluded: + initializer.name = prefix + initializer.name + + for node in self.model.graph.node: + # update name of node inputs + for j in range(len(node.input)): + if node.input[j] not in excluded: + if prefix + node.input[j] not in excluded: + node.input[j] = prefix + node.input[j] + + # update name of node outputs + for j in range(len(node.output)): + if node.output[j] not in excluded: + if prefix + node.output[j] not in excluded: + node.output[j] = prefix + node.output[j] + + for value_info in self.model.graph.value_info: + if value_info.name not in excluded: + value_info.name = prefix + value_info.name diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..111444028e4ed9aa1d068f93167f8fabaca71b92 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/shape_infer_helper.py @@ -0,0 +1,122 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import logging +import os +import sys +from typing import Dict + +# In ORT Package the symbolic_shape_infer.py is in ../tools +file_path = os.path.dirname(__file__) +if os.path.exists(os.path.join(file_path, "../tools/symbolic_shape_infer.py")): + sys.path.append(os.path.join(file_path, "../tools")) +else: + sys.path.append(os.path.join(file_path, "..")) + +from .symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy + +logger = logging.getLogger(__name__) + + +class SymbolicShapeInferenceHelper(SymbolicShapeInference): + def __init__(self, model, verbose=0, int_max=2**31 - 1, auto_merge=True, guess_output_rank=False): + super().__init__(int_max, auto_merge, guess_output_rank, verbose) + self.model_ = model + self.all_shapes_inferred_: bool = False + self.is_inferred_: bool = False + self.dynamic_axis_mapping_: Dict[str, int] = {} + + def infer(self, dynamic_axis_mapping: Dict[str, int], max_runs: int = 128): + """Run shape inference, and try replace dynamic axis from string to integer when mapping is provided. + + Args: + dynamic_axis_mapping (_type_): a dictionary with name of dynamic axis as key, like {"batch_size" : 4} + max_runs (int, optional): limit maximum number of runs to avoid infinite loop. Defaults to 32. + + Returns: + bool: whether all shapes has been inferred or not. 
+ """ + assert dynamic_axis_mapping is not None + + if self.is_inferred_ and self.dynamic_axis_mapping_ == dynamic_axis_mapping: + return self.all_shapes_inferred_ + + self.dynamic_axis_mapping_ = dynamic_axis_mapping + + self._preprocess(self.model_) + + count = 0 + while self.run_: + logger.debug(f"shape infer run {count}") + self.all_shapes_inferred_ = self._infer_impl() + count += 1 + if max_runs > 0 and count >= max_runs: + break + + self.is_inferred_ = True + return self.all_shapes_inferred_ + + def _get_sympy_shape(self, node, idx): + """Override it to ensure shape inference by giving the actual value of dynamic axis.""" + sympy_shape = [] + + shape = self._get_shape(node, idx) + if shape: + for dim in shape: + if isinstance(dim, str): + if dim in self.dynamic_axis_mapping_: + sympy_shape.append(self.dynamic_axis_mapping_[dim]) + elif dim in self.symbolic_dims_: + sympy_shape.append(self.symbolic_dims_[dim]) + else: + sympy_shape.append(sympy.Symbol(dim, integer=True)) + else: + assert dim is not None + sympy_shape.append(dim) + return sympy_shape + + def get_edge_shape(self, edge): + """Get shape of an edge. + + Args: + edge (str): name of edge + + Returns: + Optional[List[int]]: the shape, or None if shape is unknown + """ + assert self.all_shapes_inferred_ + if edge not in self.known_vi_: + print("Cannot retrieve the shape of " + str(edge)) + return None + + type_proto = self.known_vi_[edge].type + shape = get_shape_from_type_proto(type_proto) + + if shape is not None: + for i, dim in enumerate(shape): + if isinstance(dim, str) and dim in self.dynamic_axis_mapping_: + shape[i] = self.dynamic_axis_mapping_[dim] + + return shape + + def compare_shape(self, edge, edge_other): + """Compare shape of two edges. + + Args: + edge (str): name of edge + edge_other (str): name of another edge + + Raises: + Exception: At least one shape is missed for edges to compare + + Returns: + bool: whether the shape is same or not + """ + assert self.all_shapes_inferred_ + shape = self.get_edge_shape(edge) + shape_other = self.get_edge_shape(edge_other) + if shape is None or shape_other is None: + raise Exception("At least one shape is missed for edges to compare") + return shape == shape_other diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..e5157f90eedf906e3e6f24dddf03219d3ca570f7 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/passes/symbolic_shape_infer.py @@ -0,0 +1,2431 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# -*- coding: UTF-8 -*- +import argparse +import logging + +import numpy as np +import onnx +import sympy +from onnx import helper, numpy_helper, shape_inference +from packaging import version + +assert version.parse(onnx.__version__) >= version.parse("1.8.0") + +logger = logging.getLogger(__name__) + + +def get_attribute(node, attr_name, default_value=None): + found = [attr for attr in node.attribute if attr.name == attr_name] + if found: + return helper.get_attribute_value(found[0]) + return default_value + + +def get_dim_from_proto(dim): + return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None + + +def is_sequence(type_proto): + cls_type = type_proto.WhichOneof("value") + assert cls_type in ["tensor_type", "sequence_type"] + return cls_type == "sequence_type" + + +def get_shape_from_type_proto(type_proto): + assert not is_sequence(type_proto) + if type_proto.tensor_type.HasField("shape"): + return [get_dim_from_proto(d) for d in type_proto.tensor_type.shape.dim] + else: + return None # note no shape is different from shape without dim (scalar) + + +def get_shape_from_value_info(vi): + cls_type = vi.type.WhichOneof("value") + if cls_type is None: + return None + if is_sequence(vi.type): + if "tensor_type" == vi.type.sequence_type.elem_type.WhichOneof("value"): + return get_shape_from_type_proto(vi.type.sequence_type.elem_type) + else: + return None + else: + return get_shape_from_type_proto(vi.type) + + +def make_named_value_info(name): + vi = onnx.ValueInfoProto() + vi.name = name + return vi + + +def get_shape_from_sympy_shape(sympy_shape): + return [None if i is None else (int(i) if is_literal(i) else str(i)) for i in sympy_shape] + + +def is_literal(dim): + return type(dim) in [int, np.int64, np.int32, sympy.Integer] or (hasattr(dim, "is_number") and dim.is_number) + + +def handle_negative_axis(axis, rank): + assert axis < rank and axis >= -rank + return axis if axis >= 0 else rank + axis + + +def get_opset(mp, domain=None): + domain = domain or ["", "onnx", "ai.onnx"] + if type(domain) != list: + domain = [domain] + for opset in mp.opset_import: + if opset.domain in domain: + return opset.version + + return None + + +def as_scalar(x): + if type(x) == list: + assert len(x) == 1 + return x[0] + elif type(x) == np.ndarray: + return x.item() + else: + return x + + +def as_list(x, keep_none): + if type(x) == list: + return x + elif type(x) == np.ndarray: + return list(x) + elif keep_none and x is None: + return None + else: + return [x] + + +def sympy_reduce_product(x): + if type(x) == list: + value = sympy.Integer(1) + for v in x: + value = value * v + else: + value = x + return value + + +class SymbolicShapeInference: + def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): + self.dispatcher_ = { + "Add": self._infer_symbolic_compute_ops, + "ArrayFeatureExtractor": self._infer_ArrayFeatureExtractor, + "AveragePool": self._infer_Pool, + "BatchNormalization": self._infer_BatchNormalization, + "Cast": self._infer_Cast, + "CategoryMapper": self._infer_CategoryMapper, + "Compress": self._infer_Compress, + "Concat": self._infer_Concat, + "ConcatFromSequence": self._infer_ConcatFromSequence, + "Constant": self._infer_Constant, + "ConstantOfShape": self._infer_ConstantOfShape, + "Conv": self._infer_Conv, + "CumSum": self._pass_on_shape_and_type, + "Div": self._infer_symbolic_compute_ops, + "Einsum": self._infer_Einsum, + "Expand": self._infer_Expand, + "Equal": self._infer_symbolic_compute_ops, + "Floor": 
self._infer_symbolic_compute_ops, + "Gather": self._infer_Gather, + "GatherElements": self._infer_GatherElements, + "GatherND": self._infer_GatherND, + "Identity": self._pass_on_shape_and_type, + "If": self._infer_If, + "Loop": self._infer_Loop, + "MatMul": self._infer_MatMul, + "MatMulInteger16": self._infer_MatMulInteger, + "MaxPool": self._infer_Pool, + "Max": self._infer_symbolic_compute_ops, + "Min": self._infer_symbolic_compute_ops, + "Mul": self._infer_symbolic_compute_ops, + "NonMaxSuppression": self._infer_NonMaxSuppression, + "NonZero": self._infer_NonZero, + "OneHot": self._infer_OneHot, + "Pad": self._infer_Pad, + "Range": self._infer_Range, + "Reciprocal": self._pass_on_shape_and_type, + "ReduceSum": self._infer_ReduceSum, + "ReduceProd": self._infer_ReduceProd, + "Reshape": self._infer_Reshape, + "Resize": self._infer_Resize, + "Round": self._pass_on_shape_and_type, + "Scan": self._infer_Scan, + "ScatterElements": self._infer_ScatterElements, + "SequenceAt": self._infer_SequenceAt, + "SequenceInsert": self._infer_SequenceInsert, + "Shape": self._infer_Shape, + "Size": self._infer_Size, + "Slice": self._infer_Slice, + "SoftmaxCrossEntropyLoss": self._infer_SoftmaxCrossEntropyLoss, + "SoftmaxCrossEntropyLossInternal": self._infer_SoftmaxCrossEntropyLoss, + "NegativeLogLikelihoodLossInternal": self._infer_SoftmaxCrossEntropyLoss, + "Split": self._infer_Split, + "SplitToSequence": self._infer_SplitToSequence, + "Squeeze": self._infer_Squeeze, + "Sub": self._infer_symbolic_compute_ops, + "Tile": self._infer_Tile, + "TopK": self._infer_TopK, + "Transpose": self._infer_Transpose, + "Unsqueeze": self._infer_Unsqueeze, + "Where": self._infer_symbolic_compute_ops, + "ZipMap": self._infer_ZipMap, + "Neg": self._infer_symbolic_compute_ops, + # contrib ops: + "Attention": self._infer_Attention, + "BiasGelu": self._infer_BiasGelu, + "EmbedLayerNormalization": self._infer_EmbedLayerNormalization, + "FastGelu": self._infer_FastGelu, + "Gelu": self._infer_Gelu, + "LayerNormalization": self._infer_LayerNormalization, + "LongformerAttention": self._infer_LongformerAttention, + "PythonOp": self._infer_PythonOp, + "SkipLayerNormalization": self._infer_SkipLayerNormalization, + } + self.aten_op_dispatcher_ = { + "embedding": self._infer_Gather, + "bitwise_or": self._infer_aten_bitwise_or, + "diagonal": self._infer_aten_diagonal, + "max_pool2d_with_indices": self._infer_aten_pool2d, + "max": self._infer_aten_minmax, + "min": self._infer_aten_minmax, + "multinomial": self._infer_aten_multinomial, + "unfold": self._infer_aten_unfold, + "argmax": self._infer_aten_argmax, + "avg_pool2d": self._infer_aten_pool2d, + "_adaptive_avg_pool2d": self._infer_aten_pool2d, + "numpy_T": self._infer_Transpose, + } + self.run_ = True + self.suggested_merge_ = {} + self.symbolic_dims_ = {} + self.input_symbols_ = {} + self.auto_merge_ = auto_merge + self.guess_output_rank_ = guess_output_rank + self.verbose_ = verbose + self.int_max_ = int_max + self.subgraph_id_ = 0 + self.prefix_ = prefix + + def _add_suggested_merge(self, symbols, apply=False): + assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) + symbols = set(symbols) + for k, v in self.suggested_merge_.items(): + if k in symbols: + symbols.remove(k) + symbols.add(v) + map_to = None + # if there is literal, map to it first + for s in symbols: + if is_literal(s): + map_to = s + break + # when no literals, map to input symbolic dims, then existing symbolic dims + if map_to is None: + for s in symbols: + if s in 
self.input_symbols_: + map_to = s + break + if map_to is None: + for s in symbols: + if type(self.symbolic_dims_[s]) == sympy.Symbol: + map_to = s + break + # when nothing to map to, use the shorter one + if map_to is None: + if self.verbose_ > 0: + logger.warning("Potential unsafe merge between symbolic expressions: ({})".format(",".join(symbols))) + symbols_list = list(symbols) + lens = [len(s) for s in symbols_list] + map_to = symbols_list[lens.index(min(lens))] + symbols.remove(map_to) + + for s in symbols: + if s == map_to: + continue + if is_literal(map_to) and is_literal(s): + assert int(map_to) == int(s) + self.suggested_merge_[s] = int(map_to) if is_literal(map_to) else map_to + for k, v in self.suggested_merge_.items(): + if v == s: + self.suggested_merge_[k] = map_to + if apply and self.auto_merge_: + self._apply_suggested_merge() + + def _apply_suggested_merge(self, graph_input_only=False): + if not self.suggested_merge_: + return + for i in list(self.out_mp_.graph.input) + ([] if graph_input_only else list(self.out_mp_.graph.value_info)): + for d in i.type.tensor_type.shape.dim: + if d.dim_param in self.suggested_merge_: + v = self.suggested_merge_[d.dim_param] + if is_literal(v): + d.dim_value = int(v) + else: + d.dim_param = v + + def _preprocess(self, in_mp): + self.out_mp_ = onnx.ModelProto() + self.out_mp_.CopyFrom(in_mp) + self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) + self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) + self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) + self.known_vi_.update( + dict( + [ + ( + i.name, + helper.make_tensor_value_info(i.name, i.data_type, list(i.dims)), + ) + for i in self.out_mp_.graph.initializer + ] + ) + ) + + def _merge_symbols(self, dims): + if not all([type(d) == str for d in dims]): + if self.auto_merge_: + unique_dims = list(set(dims)) + is_int = [is_literal(d) for d in unique_dims] + assert sum(is_int) <= 1 # if there are more than 1 unique ints, something is wrong + if sum(is_int) == 1: + int_dim = is_int.index(1) + if self.verbose_ > 0: + logger.debug( + "dim {} has been merged with value {}".format( + unique_dims[:int_dim] + unique_dims[int_dim + 1 :], + unique_dims[int_dim], + ) + ) + self._check_merged_dims(unique_dims, allow_broadcast=False) + return unique_dims[int_dim] + else: + if self.verbose_ > 0: + logger.debug("dim {} has been mergd with dim {}".format(unique_dims[1:], unique_dims[0])) + return dims[0] + else: + return None + if all([d == dims[0] for d in dims]): + return dims[0] + merged = [self.suggested_merge_[d] if d in self.suggested_merge_ else d for d in dims] + if all([d == merged[0] for d in merged]): + assert merged[0] in self.symbolic_dims_ + return merged[0] + else: + return None + + # broadcast from right to left, and merge symbolic dims if needed + def _broadcast_shapes(self, shape1, shape2): + new_shape = [] + rank1 = len(shape1) + rank2 = len(shape2) + new_rank = max(rank1, rank2) + for i in range(new_rank): + dim1 = shape1[rank1 - 1 - i] if i < rank1 else 1 + dim2 = shape2[rank2 - 1 - i] if i < rank2 else 1 + if dim1 == 1 or dim1 == dim2: + new_dim = dim2 + elif dim2 == 1: + new_dim = dim1 + else: + new_dim = self._merge_symbols([dim1, dim2]) + if not new_dim: + # warning about unsupported broadcast when not auto merge + # note that auto merge has the risk of incorrectly merge symbols while one of them being 1 + # for example, 'a' = 1, 'b' = 5 at runtime is valid broadcasting, but with auto merge 'a' == 'b' 
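+                    # With auto merge the two symbols are recorded (and applied) as a
+                    # suggested merge; otherwise only a warning is logged and this output
+                    # dimension is left unresolved.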
+ if self.auto_merge_: + self._add_suggested_merge([dim1, dim2], apply=True) + else: + logger.warning("unsupported broadcast between " + str(dim1) + " " + str(dim2)) + new_shape = [new_dim] + new_shape + return new_shape + + def _get_shape(self, node, idx): + name = node.input[idx] + if name in self.known_vi_: + vi = self.known_vi_[name] + return get_shape_from_value_info(vi) + else: + assert name in self.initializers_ + return list(self.initializers_[name].dims) + + def _get_shape_rank(self, node, idx): + return len(self._get_shape(node, idx)) + + def _get_sympy_shape(self, node, idx): + sympy_shape = [] + for d in self._get_shape(node, idx): + if type(d) == str: + sympy_shape.append( + self.symbolic_dims_[d] + if d in self.symbolic_dims_ + else sympy.Symbol(d, integer=True, nonnegative=True) + ) + else: + assert None != d + sympy_shape.append(d) + return sympy_shape + + def _get_value(self, node, idx): + name = node.input[idx] + assert name in self.sympy_data_ or name in self.initializers_ + return self.sympy_data_[name] if name in self.sympy_data_ else numpy_helper.to_array(self.initializers_[name]) + + def _try_get_value(self, node, idx): + if idx >= len(node.input): + return None + name = node.input[idx] + if name in self.sympy_data_ or name in self.initializers_: + return self._get_value(node, idx) + return None + + def _update_computed_dims(self, new_sympy_shape): + for i, new_dim in enumerate(new_sympy_shape): + if not is_literal(new_dim) and not type(new_dim) == str: + str_dim = str(new_dim) + if str_dim in self.suggested_merge_: + if is_literal(self.suggested_merge_[str_dim]): + continue # no need to create dim for literals + new_sympy_shape[i] = self.symbolic_dims_[self.suggested_merge_[str_dim]] + else: + # add new_dim if it's a computational expression + if not str(new_dim) in self.symbolic_dims_: + self.symbolic_dims_[str(new_dim)] = new_dim + + def _onnx_infer_single_node(self, node): + # skip onnx shape inference for some ops, as they are handled in _infer_* + skip_infer = node.op_type in [ + "If", + "Loop", + "Scan", + "SplitToSequence", + "ZipMap", # contrib ops + "Attention", + "BiasGelu", + "EmbedLayerNormalization", + "FastGelu", + "Gelu", + "LayerNormalization", + "LongformerAttention", + "SkipLayerNormalization", + "PythonOp", + ] + + if not skip_infer: + # Only pass initializers that satisfy the following condition: + # (1) Operator need value of some input for shape inference. + # For example, Unsqueeze in opset 13 uses the axes input to calculate shape of output. + # (2) opset version >= 9. In older version, initializer is required in graph input by onnx spec. + # (3) The initializer is not in graph input. The means the node input is "constant" in inference. 
+ initializers = [] + if (get_opset(self.out_mp_) >= 9) and node.op_type in ["Unsqueeze"]: + initializers = [ + self.initializers_[name] + for name in node.input + if (name in self.initializers_ and name not in self.graph_inputs_) + ] + + # run single node inference with self.known_vi_ shapes + tmp_graph = helper.make_graph( + [node], + "tmp", + [self.known_vi_[i] for i in node.input if i], + [make_named_value_info(i) for i in node.output], + initializers, + ) + + self.tmp_mp_.graph.CopyFrom(tmp_graph) + + self.tmp_mp_ = shape_inference.infer_shapes(self.tmp_mp_) + + for i_o in range(len(node.output)): + o = node.output[i_o] + vi = self.out_mp_.graph.value_info.add() + if not skip_infer: + vi.CopyFrom(self.tmp_mp_.graph.output[i_o]) + else: + vi.name = o + self.known_vi_[o] = vi + + def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph_id=True): + if self.verbose_ > 2: + logger.debug( + "Inferencing subgraph of node {} with output({}...): {}".format(node.name, node.output[0], node.op_type) + ) + # node inputs are not passed directly to the subgraph + # it's up to the node dispatcher to prepare subgraph input + # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape + # besides, inputs in subgraph could shadow implicit inputs + subgraph_inputs = set([i.name for i in list(subgraph.initializer) + list(subgraph.input)]) + subgraph_implicit_input = set([name for name in self.known_vi_.keys() if not name in subgraph_inputs]) + tmp_graph = helper.make_graph( + list(subgraph.node), + "tmp", + list(subgraph.input) + [self.known_vi_[i] for i in subgraph_implicit_input], + [make_named_value_info(i.name) for i in subgraph.output], + ) + tmp_graph.initializer.extend([i for i in self.out_mp_.graph.initializer if i.name in subgraph_implicit_input]) + tmp_graph.initializer.extend(subgraph.initializer) + self.tmp_mp_.graph.CopyFrom(tmp_graph) + + symbolic_shape_inference = SymbolicShapeInference( + self.int_max_, + self.auto_merge_, + self.guess_output_rank_, + self.verbose_, + prefix=self.prefix_ + "_" + str(self.subgraph_id_), + ) + if inc_subgraph_id: + self.subgraph_id_ += 1 + + all_shapes_inferred = False + symbolic_shape_inference._preprocess(self.tmp_mp_) + symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy() + while symbolic_shape_inference.run_: + all_shapes_inferred = symbolic_shape_inference._infer_impl(self.sympy_data_.copy()) + symbolic_shape_inference._update_output_from_vi() + if use_node_input: + # if subgraph uses node input, it needs to update to merged dims + subgraph.ClearField("input") + subgraph.input.extend(symbolic_shape_inference.out_mp_.graph.input[: len(node.input)]) + subgraph.ClearField("output") + subgraph.output.extend(symbolic_shape_inference.out_mp_.graph.output) + subgraph.ClearField("value_info") + subgraph.value_info.extend(symbolic_shape_inference.out_mp_.graph.value_info) + subgraph.ClearField("node") + subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node) + # for new symbolic dims from subgraph output, add to main graph symbolic dims + subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output] + subgraph_new_symbolic_dims = set( + [d for s in subgraph_shapes if s for d in s if type(d) == str and not d in self.symbolic_dims_] + ) + new_dims = {} + for d in subgraph_new_symbolic_dims: + assert d in symbolic_shape_inference.symbolic_dims_ + new_dims[d] = symbolic_shape_inference.symbolic_dims_[d] + self.symbolic_dims_.update(new_dims) 
+ return symbolic_shape_inference + + def _get_int_values(self, node, broadcast=False): + values = [self._try_get_value(node, i) for i in range(len(node.input))] + if all([v is not None for v in values]): + # some shape compute is in floating point, cast to int for sympy + for i, v in enumerate(values): + if type(v) != np.ndarray: + continue + if len(v.shape) > 1: + new_v = None # ignore value for rank > 1 + elif len(v.shape) == 0: + new_v = int(v.item()) + else: + assert len(v.shape) == 1 + new_v = [int(vv) for vv in v] + values[i] = new_v + values_len = [len(v) if type(v) == list else 0 for v in values] + max_len = max(values_len) + if max_len >= 1 and broadcast: + # broadcast + for i, v in enumerate(values): + if v is None: + continue # don't broadcast if value is unknown + if type(v) == list: + if len(v) < max_len: + values[i] = v * max_len + else: + assert len(v) == max_len + else: + values[i] = [v] * max_len + return values + + def _compute_on_sympy_data(self, node, op_func): + assert len(node.output) == 1 + values = self._get_int_values(node, broadcast=True) + if all([v is not None for v in values]): + is_list = [type(v) == list for v in values] + as_list = any(is_list) + if as_list: + self.sympy_data_[node.output[0]] = [op_func(vs) for vs in zip(*values)] + else: + self.sympy_data_[node.output[0]] = op_func(values) + + def _pass_on_sympy_data(self, node): + assert len(node.input) == 1 or node.op_type in [ + "Reshape", + "Unsqueeze", + "Squeeze", + ] + self._compute_on_sympy_data(node, lambda x: x[0]) + + def _pass_on_shape_and_type(self, node): + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + self._get_shape(node, 0), + ) + ) + + def _new_symbolic_dim(self, prefix, dim): + new_dim = "{}_d{}".format(prefix, dim) + if new_dim in self.suggested_merge_: + v = self.suggested_merge_[new_dim] + new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v + else: + new_symbolic_dim = sympy.Symbol(new_dim, integer=True, nonnegative=True) + self.symbolic_dims_[new_dim] = new_symbolic_dim + return new_symbolic_dim + + def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): + return self._new_symbolic_dim( + "{}{}_{}_o{}_".format( + node.op_type, + self.prefix_, + list(self.out_mp_.graph.node).index(node), + out_idx, + ), + dim, + ) + + def _new_symbolic_shape(self, rank, node, out_idx=0): + return [self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank)] + + def _compute_conv_pool_shape(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + if len(node.input) > 1: + W_shape = self._get_sympy_shape(node, 1) + rank = len(W_shape) - 2 # number of spatial axes + kernel_shape = W_shape[-rank:] + sympy_shape[1] = W_shape[0] + else: + W_shape = None + kernel_shape = get_attribute(node, "kernel_shape") + rank = len(kernel_shape) + + assert len(sympy_shape) == rank + 2 + + # only need to symbolic shape inference if input has symbolic dims in spatial axes + is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]] + + if not any(is_symbolic_dims): + shape = get_shape_from_value_info(self.known_vi_[node.output[0]]) + if len(shape) > 0: + assert len(sympy_shape) == len(shape) + sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] + return sympy_shape + + dilations = get_attribute(node, "dilations", [1] * rank) + strides = get_attribute(node, "strides", [1] * rank) + effective_kernel_shape = [(k - 1) * d + 1 for k, d in 
zip(kernel_shape, dilations)] + pads = get_attribute(node, "pads") + if pads is None: + pads = [0] * (2 * rank) + auto_pad = get_attribute(node, "auto_pad", b"NOTSET").decode("utf-8") + if auto_pad != "VALID" and auto_pad != "NOTSET": + try: + residual = [sympy.Mod(d, s) for d, s in zip(sympy_shape[-rank:], strides)] + total_pads = [ + max(0, (k - s) if r == 0 else (k - r)) + for k, s, r in zip(effective_kernel_shape, strides, residual) + ] + except TypeError: # sympy may throw TypeError: cannot determine truth value of Relational + total_pads = [ + max(0, (k - s)) for k, s in zip(effective_kernel_shape, strides) + ] # assuming no residual if sympy throws error + elif auto_pad == "VALID": + total_pads = [] + else: + total_pads = [0] * rank + else: + assert len(pads) == 2 * rank + total_pads = [p1 + p2 for p1, p2 in zip(pads[:rank], pads[rank:])] + + ceil_mode = get_attribute(node, "ceil_mode", 0) + for i in range(rank): + effective_input_size = sympy_shape[-rank + i] + if len(total_pads) > 0: + effective_input_size = effective_input_size + total_pads[i] + if ceil_mode: + strided_kernel_positions = sympy.ceiling( + (effective_input_size - effective_kernel_shape[i]) / strides[i] + ) + else: + strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) // strides[i] + sympy_shape[-rank + i] = strided_kernel_positions + 1 + return sympy_shape + + def _check_merged_dims(self, dims, allow_broadcast=True): + if allow_broadcast: + dims = [d for d in dims if not (is_literal(d) and int(d) <= 1)] + if not all([d == dims[0] for d in dims]): + self._add_suggested_merge(dims, apply=True) + + def _compute_matmul_shape(self, node, output_dtype=None): + lhs_shape = self._get_shape(node, 0) + rhs_shape = self._get_shape(node, 1) + lhs_rank = len(lhs_shape) + rhs_rank = len(rhs_shape) + lhs_reduce_dim = 0 + rhs_reduce_dim = 0 + assert lhs_rank > 0 and rhs_rank > 0 + if lhs_rank == 1 and rhs_rank == 1: + new_shape = [] + elif lhs_rank == 1: + rhs_reduce_dim = -2 + new_shape = rhs_shape[:rhs_reduce_dim] + [rhs_shape[-1]] + elif rhs_rank == 1: + lhs_reduce_dim = -1 + new_shape = lhs_shape[:lhs_reduce_dim] + else: + lhs_reduce_dim = -1 + rhs_reduce_dim = -2 + new_shape = self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]) + [lhs_shape[-2]] + [rhs_shape[-1]] + # merge reduce dim + self._check_merged_dims( + [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]], + allow_broadcast=False, + ) + if output_dtype is None: + # infer output_dtype from input type when not specified + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape)) + + def _fuse_tensor_type(self, node, out_idx, dst_type, src_type): + """ + update dst_tensor_type to be compatible with src_tensor_type when dimension mismatches + """ + dst_tensor_type = ( + dst_type.sequence_type.elem_type.tensor_type if is_sequence(dst_type) else dst_type.tensor_type + ) + src_tensor_type = ( + src_type.sequence_type.elem_type.tensor_type if is_sequence(src_type) else src_type.tensor_type + ) + if dst_tensor_type.elem_type != src_tensor_type.elem_type: + node_id = node.name if node.name else node.op_type + raise ValueError( + f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: " + f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs " + f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}" + ) + if dst_tensor_type.HasField("shape"): + for di, ds 
in enumerate(zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim)): + if ds[0] != ds[1]: + # create a new symbolic dimension for node/out_idx/mismatch dim id in dst_tensor_type for tensor_type + # for sequence_type, clear the dimension + new_dim = onnx.TensorShapeProto.Dimension() + if not is_sequence(dst_type): + new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, out_idx, di)) + dst_tensor_type.shape.dim[di].CopyFrom(new_dim) + else: + dst_tensor_type.CopyFrom(src_tensor_type) + + def _infer_ArrayFeatureExtractor(self, node): + data_shape = self._get_shape(node, 0) + indices_shape = self._get_shape(node, 1) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + data_shape[:-1] + indices_shape, + ) + ) + + def _infer_symbolic_compute_ops(self, node): + funcs = { + "Add": lambda l: l[0] + l[1], + "Div": lambda l: l[0] // l[1], # integer div in sympy + "Equal": lambda l: l[0] == l[1], + "Floor": lambda l: sympy.floor(l[0]), + "Max": lambda l: l[1] + if is_literal(l[0]) and int(l[0]) < -self.int_max_ + else (l[0] if is_literal(l[1]) and int(l[1]) < -self.int_max_ else sympy.Max(l[0], l[1])), + "Min": lambda l: l[1] + if is_literal(l[0]) and int(l[0]) > self.int_max_ + else (l[0] if is_literal(l[1]) and int(l[1]) > self.int_max_ else sympy.Min(l[0], l[1])), + "Mul": lambda l: l[0] * l[1], + "Sub": lambda l: l[0] - l[1], + "Where": lambda l: l[1] if l[0] else l[2], + "Neg": lambda l: -l[0], + } + assert node.op_type in funcs + self._compute_on_sympy_data(node, funcs[node.op_type]) + + def _infer_Cast(self, node): + self._pass_on_sympy_data(node) + + def _infer_CategoryMapper(self, node): + input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type + if input_type == onnx.TensorProto.STRING: + output_type = onnx.TensorProto.INT64 + else: + output_type = onnx.TensorProto.STRING + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_type, self._get_shape(node, 0))) + + def _infer_Compress(self, node): + input_shape = self._get_shape(node, 0) + # create a new symbolic dimension for Compress output + compress_len = str(self._new_symbolic_dim_from_output(node)) + axis = get_attribute(node, "axis") + if axis == None: + # when axis is not specified, input is flattened before compress so output is 1D + output_shape = [compress_len] + else: + output_shape = input_shape + output_shape[handle_negative_axis(axis, len(input_shape))] = compress_len + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + output_shape, + ) + ) + + def _infer_Concat(self, node): + if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]): + values = self._get_int_values(node) + if all([v is not None for v in values]): + assert 0 == get_attribute(node, "axis") + self.sympy_data_[node.output[0]] = [] + for i in range(len(node.input)): + value = values[i] + if type(value) == list: + self.sympy_data_[node.output[0]].extend(value) + else: + self.sympy_data_[node.output[0]].append(value) + + sympy_shape = self._get_sympy_shape(node, 0) + axis = handle_negative_axis(get_attribute(node, "axis"), len(sympy_shape)) + for i_idx in range(1, len(node.input)): + input_shape = self._get_sympy_shape(node, i_idx) + if input_shape: + sympy_shape[axis] = sympy_shape[axis] + input_shape[axis] + 
self._update_computed_dims(sympy_shape) + # merge symbolic dims for non-concat axes + for d in range(len(sympy_shape)): + if d == axis: + continue + dims = [self._get_shape(node, i_idx)[d] for i_idx in range(len(node.input)) if self._get_shape(node, i_idx)] + if all([d == dims[0] for d in dims]): + continue + merged = self._merge_symbols(dims) + if type(merged) == str: + sympy_shape[d] = self.symbolic_dims_[merged] if merged else None + else: + sympy_shape[d] = merged + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + + def _infer_ConcatFromSequence(self, node): + seq_shape = self._get_shape(node, 0) + new_axis = 1 if get_attribute(node, "new_axis") else 0 + axis = handle_negative_axis(get_attribute(node, "axis"), len(seq_shape) + new_axis) + concat_dim = str(self._new_symbolic_dim_from_output(node, 0, axis)) + new_shape = seq_shape + if new_axis: + new_shape = seq_shape[:axis] + [concat_dim] + seq_shape[axis:] + else: + new_shape[axis] = concat_dim + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.sequence_type.elem_type.tensor_type.elem_type, + new_shape, + ) + ) + + def _infer_Constant(self, node): + t = get_attribute(node, "value") + self.sympy_data_[node.output[0]] = numpy_helper.to_array(t) + + def _infer_ConstantOfShape(self, node): + sympy_shape = self._get_int_values(node)[0] + vi = self.known_vi_[node.output[0]] + if sympy_shape is not None: + if type(sympy_shape) != list: + sympy_shape = [sympy_shape] + self._update_computed_dims(sympy_shape) + # update sympy data if output type is int, and shape is known + if vi.type.tensor_type.elem_type == onnx.TensorProto.INT64 and all([is_literal(x) for x in sympy_shape]): + self.sympy_data_[node.output[0]] = np.ones( + [int(x) for x in sympy_shape], dtype=np.int64 + ) * numpy_helper.to_array(get_attribute(node, "value", 0)) + else: + # create new dynamic shape + # note input0 is a 1D vector of shape, the new symbolic shape has the rank of the shape vector length + sympy_shape = self._new_symbolic_shape(self._get_shape(node, 0)[0], node) + + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + + def _infer_Conv(self, node): + sympy_shape = self._compute_conv_pool_shape(node) + self._update_computed_dims(sympy_shape) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + + def _infer_Einsum(self, node): + # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 + equation = get_attribute(node, "equation") + equation = equation.replace(b" ", b"") + mid_index = equation.find(b"->") + left_equation = equation[:mid_index] if mid_index != -1 else equation + + num_operands = 0 + num_ellipsis = 0 + num_ellipsis_indices = 0 + + letter_to_dim = {} + + terms = left_equation.split(b",") + for term in terms: + ellipsis_index = term.find(b"...") + shape = self._get_shape(node, num_operands) + rank = len(shape) + if ellipsis_index != -1: + if num_ellipsis == 0: + num_ellipsis_indices = rank - len(term) + 3 + num_ellipsis = num_ellipsis + 1 + for i in range(1, rank + 1): + letter = term[-i] + if letter != 46: # letter != 
b'.' + dim = shape[-i] + if letter not in letter_to_dim.keys(): + letter_to_dim[letter] = dim + elif type(dim) != sympy.Symbol: + letter_to_dim[letter] = dim + num_operands = num_operands + 1 + + new_sympy_shape = [] + from collections import OrderedDict + + num_letter_occurrences = OrderedDict() + if mid_index != -1: + right_equation = equation[mid_index + 2 :] + right_ellipsis_index = right_equation.find(b"...") + if right_ellipsis_index != -1: + for i in range(num_ellipsis_indices): + new_sympy_shape.append(shape[i]) + for c in right_equation: + if c != 46: # c != b'.' + new_sympy_shape.append(letter_to_dim[c]) + else: + for i in range(num_ellipsis_indices): + new_sympy_shape.append(shape[i]) + for c in left_equation: + if c != 44 and c != 46: # c != b',' and c != b'.': + if c in num_letter_occurrences: + num_letter_occurrences[c] = num_letter_occurrences[c] + 1 + else: + num_letter_occurrences[c] = 1 + for key, value in num_letter_occurrences.items(): + if value == 1: + new_sympy_shape.append(letter_to_dim[key]) + + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape)) + + def _infer_Expand(self, node): + expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True) + if expand_to_shape is not None: + # new_shape's dim can come from shape value + self._update_computed_dims(expand_to_shape) + shape = self._get_shape(node, 0) + new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape)) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + new_shape, + ) + ) + + def _infer_Gather(self, node): + data_shape = self._get_shape(node, 0) + axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape)) + indices_shape = self._get_shape(node, 1) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + data_shape[:axis] + indices_shape + data_shape[axis + 1 :], + ) + ) + # for 1D input, do some sympy compute + if node.input[0] in self.sympy_data_ and len(data_shape) == 1 and 0 == get_attribute(node, "axis", 0): + idx = self._try_get_value(node, 1) + if idx is not None: + data = self.sympy_data_[node.input[0]] + if type(data) == list: + if type(idx) == np.ndarray and len(idx.shape) == 1: + self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx] + else: + self.sympy_data_[node.output[0]] = data[int(idx)] + else: + assert idx == 0 or idx == -1 + self.sympy_data_[node.output[0]] = data + + def _infer_GatherElements(self, node): + indices_shape = self._get_shape(node, 1) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + indices_shape, + ) + ) + + def _infer_GatherND(self, node): + data_shape = self._get_shape(node, 0) + data_rank = len(data_shape) + indices_shape = self._get_shape(node, 1) + indices_rank = len(indices_shape) + last_index_dimension = indices_shape[-1] + assert is_literal(last_index_dimension) and last_index_dimension <= data_rank + new_shape = indices_shape[:-1] + data_shape[last_index_dimension:] + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + 
self.known_vi_[node.input[0]].type.tensor_type.elem_type, + new_shape, + ) + ) + + def _infer_If(self, node): + # special case for constant condition, in case there are mismatching shape from the non-executed branch + subgraphs = [ + get_attribute(node, "then_branch"), + get_attribute(node, "else_branch"), + ] + cond = self._try_get_value(node, 0) + if cond is not None: + if as_scalar(cond) > 0: + subgraphs[1].CopyFrom(subgraphs[0]) + else: + subgraphs[0].CopyFrom(subgraphs[1]) + + for i_sub, subgraph in enumerate(subgraphs): + subgraph_infer = self._onnx_infer_subgraph(node, subgraph, use_node_input=False) + for i_out in range(len(node.output)): + vi = self.known_vi_[node.output[i_out]] + if i_sub == 0: + vi.CopyFrom(subgraph.output[i_out]) + vi.name = node.output[i_out] + else: + self._fuse_tensor_type(node, i_out, vi.type, subgraph.output[i_out].type) + + # pass on sympy data from subgraph, if cond is constant + if cond is not None and i_sub == (0 if as_scalar(cond) > 0 else 1): + if subgraph.output[i_out].name in subgraph_infer.sympy_data_: + self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[subgraph.output[i_out].name] + + def _infer_Loop(self, node): + subgraph = get_attribute(node, "body") + assert len(subgraph.input) == len(node.input) + num_loop_carried = len(node.input) - 2 # minus the length and initial loop condition + # when sequence_type is used as loop carried input + # needs to run subgraph infer twice if the tensor shape in sequence contains None + for i, si in enumerate(subgraph.input): + si_name = si.name + si.CopyFrom(self.known_vi_[node.input[i]]) + si.name = si_name + + self._onnx_infer_subgraph(node, subgraph) + + # check subgraph input/output for shape changes in loop carried variables + # for tensor_type, create new symbolic dim when changing, i.e., output = Concat(input, a) + # for sequence_type, propagate from output to input + need_second_infer = False + for i_out in range(1, num_loop_carried + 1): + so = subgraph.output[i_out] + so_shape = get_shape_from_value_info(so) + if is_sequence(so.type): + if so_shape and None in so_shape: + # copy shape from output to input + # note that loop input is [loop_len, cond, input_0, input_1, ...] + # while loop output is [cond, output_0, output_1, ...] 
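+                    # note: because loop inputs carry the extra loop_len slot, subgraph.input[i_out + 1]
+                    # below is the carried input that pairs with subgraph.output[i_out]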
+ subgraph.input[i_out + 1].type.sequence_type.elem_type.CopyFrom(so.type.sequence_type.elem_type) + need_second_infer = True + else: + si = subgraph.input[i_out + 1] + si_shape = get_shape_from_value_info(si) + for di, dims in enumerate(zip(si_shape, so_shape)): + if dims[0] != dims[1]: + new_dim = onnx.TensorShapeProto.Dimension() + new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, i_out, di)) + si.type.tensor_type.shape.dim[di].CopyFrom(new_dim) + so.type.tensor_type.shape.dim[di].CopyFrom(new_dim) + need_second_infer = True + + if need_second_infer: + if self.verbose_ > 2: + logger.debug( + "Rerun Loop: {}({}...), because of sequence in loop carried variables".format( + node.name, node.output[0] + ) + ) + self._onnx_infer_subgraph(node, subgraph, inc_subgraph_id=False) + + # create a new symbolic dimension for iteration dependent dimension + loop_iter_dim = str(self._new_symbolic_dim_from_output(node)) + for i in range(len(node.output)): + vi = self.known_vi_[node.output[i]] + vi.CopyFrom(subgraph.output[i + 1]) # first subgraph output is condition, not in node output + if i >= num_loop_carried: + assert not is_sequence(vi.type) # TODO: handle loop accumulation in sequence_type + subgraph_vi_dim = subgraph.output[i + 1].type.tensor_type.shape.dim + vi.type.tensor_type.shape.ClearField("dim") + vi_dim = vi.type.tensor_type.shape.dim + vi_dim.add().dim_param = loop_iter_dim + vi_dim.extend(list(subgraph_vi_dim)) + vi.name = node.output[i] + + def _infer_MatMul(self, node): + self._compute_matmul_shape(node) + + def _infer_MatMulInteger(self, node): + self._compute_matmul_shape(node, onnx.TensorProto.INT32) + + def _infer_NonMaxSuppression(self, node): + selected = str(self._new_symbolic_dim_from_output(node)) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [selected, 3])) + + def _infer_NonZero(self, node): + input_rank = self._get_shape_rank(node, 0) + # create a new symbolic dimension for NonZero output + nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1)) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len])) + + def _infer_OneHot(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + depth = self._try_get_value(node, 1) + axis = get_attribute(node, "axis", -1) + axis = handle_negative_axis(axis, len(sympy_shape) + 1) + new_shape = get_shape_from_sympy_shape( + sympy_shape[:axis] + + [self._new_symbolic_dim_from_output(node) if not is_literal(depth) else depth] + + sympy_shape[axis:] + ) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[2]].type.tensor_type.elem_type, + new_shape, + ) + ) + + def _infer_Pad(self, node): + if get_opset(self.out_mp_) <= 10: + pads = get_attribute(node, "pads") + else: + pads = self._try_get_value(node, 1) + + sympy_shape = self._get_sympy_shape(node, 0) + rank = len(sympy_shape) + + if pads is not None: + assert len(pads) == 2 * rank + new_sympy_shape = [ + d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:]) + ] + self._update_computed_dims(new_sympy_shape) + else: + # dynamic pads, create new symbolic dimensions + new_sympy_shape = self._new_symbolic_shape(rank, node) + output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type + + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + 
helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape)) + ) + + def _infer_Pool(self, node): + sympy_shape = self._compute_conv_pool_shape(node) + self._update_computed_dims(sympy_shape) + for o in node.output: + if not o: + continue + vi = self.known_vi_[o] + vi.CopyFrom( + helper.make_tensor_value_info( + o, + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + + def _infer_aten_bitwise_or(self, node): + shape0 = self._get_shape(node, 0) + shape1 = self._get_shape(node, 1) + new_shape = self._broadcast_shapes(shape0, shape1) + t0 = self.known_vi_[node.input[0]] + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], t0.type.tensor_type.elem_type, new_shape)) + + def _infer_aten_diagonal(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + rank = len(sympy_shape) + offset = self._try_get_value(node, 1) + dim1 = self._try_get_value(node, 2) + dim2 = self._try_get_value(node, 3) + + assert offset is not None and dim1 is not None and dim2 is not None + dim1 = handle_negative_axis(dim1, rank) + dim2 = handle_negative_axis(dim2, rank) + + new_shape = [] + for dim, val in enumerate(sympy_shape): + if dim not in [dim1, dim2]: + new_shape.append(val) + + shape1 = sympy_shape[dim1] + shape2 = sympy_shape[dim2] + if offset >= 0: + diag_shape = sympy.Max(0, sympy.Min(shape1, shape2 - offset)) + else: + diag_shape = sympy.Max(0, sympy.Min(shape1 + offset, shape2)) + new_shape.append(diag_shape) + + if node.output[0]: + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_shape), + ) + ) + + def _infer_aten_multinomial(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + rank = len(sympy_shape) + assert rank in [1, 2] + num_samples = self._try_get_value(node, 1) + di = rank - 1 + last_dim = num_samples if num_samples else str(self._new_symbolic_dim_from_output(node, 0, di)) + output_shape = sympy_shape[:-1] + [last_dim] + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + onnx.TensorProto.INT64, + get_shape_from_sympy_shape(output_shape), + ) + ) + + def _infer_aten_pool2d(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + assert len(sympy_shape) == 4 + sympy_shape[-2:] = [self._new_symbolic_dim_from_output(node, 0, i) for i in [2, 3]] + self._update_computed_dims(sympy_shape) + for i, o in enumerate(node.output): + if not o: + continue + vi = self.known_vi_[o] + elem_type = onnx.TensorProto.INT64 if i == 1 else self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi.CopyFrom(helper.make_tensor_value_info(o, elem_type, get_shape_from_sympy_shape(sympy_shape))) + + def _infer_aten_minmax(self, node): + vi = self.known_vi_[node.output[0]] + if len(node.input) == 1: + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, [] + ) + ) + else: + assert len(node.input) == 3 + keepdim = self._try_get_value(node, 2) + assert keepdim is not None # can only handle known keepdim case. 
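+            # note: the 3-input form corresponds to torch.min/max(input, dim, keepdim), which
+            # returns a (values, indices) pair; the reduced axis is kept as 1 or dropped below
+            # depending on keepdim, and the indices output (output 1) is INT64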
+ dim = self._try_get_value(node, 1) + if dim is None: + rank = self._get_shape_rank(node, 0) + output_shape = self._new_symbolic_shape(rank if keepdim else rank - 1, node) + else: + shape = self._get_sympy_shape(node, 0) + dim = handle_negative_axis(dim, len(shape)) + output_shape = shape[:dim] + if keepdim: + output_shape += [1] + output_shape += shape[dim + 1 :] + + output_shape = get_shape_from_sympy_shape(output_shape) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, output_shape + ) + ) + vi1 = self.known_vi_[node.output[1]] + vi1.CopyFrom(helper.make_tensor_value_info(node.output[1], onnx.TensorProto.INT64, output_shape)) + + def _infer_aten_unfold(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + dimension = self._try_get_value(node, 1) + size = self._try_get_value(node, 2) + step = self._try_get_value(node, 3) + if dimension is not None and size is not None and step is not None: + assert dimension < len(sympy_shape) + sympy_shape[dimension] = (sympy_shape[dimension] - size) // step + 1 + sympy_shape.append(size) + else: + rank = len(sympy_shape) + sympy_shape = self._new_symbolic_shape(rank + 1, node) + self._update_computed_dims(sympy_shape) + if node.output[0]: + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + + def _infer_aten_argmax(self, node): + new_shape = None + if node.input[1] == "": + # The argmax of the flattened input is returned. + new_shape = [] + else: + dim = self._try_get_value(node, 1) + keepdim = self._try_get_value(node, 2) + if keepdim is not None: + sympy_shape = self._get_sympy_shape(node, 0) + if dim is not None: + dim = handle_negative_axis(dim, len(sympy_shape)) + if keepdim: + sympy_shape[dim] = 1 + else: + del sympy_shape[dim] + else: + rank = len(sympy_shape) + sympy_shape = self._new_symbolic_shape(rank if keepdim else rank - 1, node) + self._update_computed_dims(sympy_shape) + new_shape = get_shape_from_sympy_shape(sympy_shape) + if node.output[0] and new_shape is not None: + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, new_shape)) + + def _infer_BatchNormalization(self, node): + self._propagate_shape_and_type(node) + + # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop + for i in [1, 2, 3, 4]: + if i < len(node.output) and node.output[i] != "": + # all of these parameters have the same shape as the 1st input + self._propagate_shape_and_type(node, input_index=1, output_index=i) + + def _infer_Range(self, node): + vi = self.known_vi_[node.output[0]] + input_data = self._get_int_values(node) + if all([i is not None for i in input_data]): + start = as_scalar(input_data[0]) + limit = as_scalar(input_data[1]) + delta = as_scalar(input_data[2]) + new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start) / delta), 0)] + else: + new_sympy_shape = [self._new_symbolic_dim_from_output(node)] + self._update_computed_dims(new_sympy_shape) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + + def _infer_ReduceSum(self, node): + keep_dims = get_attribute(node, "keepdims", 1) + if get_opset(self.out_mp_) >= 13 and len(node.input) > 1: + # ReduceSum changes axes to input[1] in 
opset 13 + axes = self._try_get_value(node, 1) + vi = self.known_vi_[node.output[0]] + if axes is None: + assert keep_dims # can only handle keep_dims==True when axes is unknown, by generating new ranks + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(self._new_symbolic_shape(self._get_shape_rank(node, 0), node)), + ) + ) + else: + shape = self._get_shape(node, 0) + output_shape = [] + axes = [handle_negative_axis(a, len(shape)) for a in axes] + for i, d in enumerate(shape): + if i in axes: + if keep_dims: + output_shape.append(1) + else: + output_shape.append(d) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + output_shape, + ) + ) + + def _infer_ReduceProd(self, node): + axes = get_attribute(node, "axes") + keep_dims = get_attribute(node, "keepdims", 1) + if keep_dims == 0 and axes == [0]: + data = self._get_int_values(node)[0] + if data is not None: + self.sympy_data_[node.output[0]] = sympy_reduce_product(data) + + def _infer_Reshape(self, node): + shape_value = self._try_get_value(node, 1) + vi = self.known_vi_[node.output[0]] + if shape_value is None: + shape_shape = self._get_shape(node, 1) + assert len(shape_shape) == 1 + shape_rank = shape_shape[0] + assert is_literal(shape_rank) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(self._new_symbolic_shape(shape_rank, node)), + ) + ) + else: + input_sympy_shape = self._get_sympy_shape(node, 0) + total = int(1) + for d in input_sympy_shape: + total = total * d + new_sympy_shape = [] + deferred_dim_idx = -1 + non_deferred_size = int(1) + for i, d in enumerate(shape_value): + if type(d) == sympy.Symbol: + new_sympy_shape.append(d) + elif d == 0: + new_sympy_shape.append(input_sympy_shape[i]) + non_deferred_size = non_deferred_size * input_sympy_shape[i] + else: + new_sympy_shape.append(d) + if d == -1: + deferred_dim_idx = i + elif d != 0: + non_deferred_size = non_deferred_size * d + + assert new_sympy_shape.count(-1) < 2 + if -1 in new_sympy_shape: + new_dim = total // non_deferred_size + new_sympy_shape[deferred_dim_idx] = new_dim + + self._update_computed_dims(new_sympy_shape) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + + self._pass_on_sympy_data(node) + + def _infer_Resize(self, node): + vi = self.known_vi_[node.output[0]] + input_sympy_shape = self._get_sympy_shape(node, 0) + if get_opset(self.out_mp_) <= 10: + scales = self._try_get_value(node, 1) + if scales is not None: + new_sympy_shape = [sympy.simplify(sympy.floor(d * s)) for d, s in zip(input_sympy_shape, scales)] + self._update_computed_dims(new_sympy_shape) + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + else: + roi = self._try_get_value(node, 1) + scales = self._try_get_value(node, 2) + sizes = self._try_get_value(node, 3) + if sizes is not None: + new_sympy_shape = [sympy.simplify(sympy.floor(s)) for s in sizes] + self._update_computed_dims(new_sympy_shape) + elif scales is not None: + rank = len(scales) + if get_attribute(node, "coordinate_transformation_mode") == "tf_crop_and_resize": + assert len(roi) == 2 * rank + roi_start = list(roi)[:rank] + roi_end = 
list(roi)[rank:] + else: + roi_start = [0] * rank + roi_end = [1] * rank + scales = list(scales) + new_sympy_shape = [ + sympy.simplify(sympy.floor(d * (end - start) * scale)) + for d, start, end, scale in zip(input_sympy_shape, roi_start, roi_end, scales) + ] + self._update_computed_dims(new_sympy_shape) + else: + new_sympy_shape = self._new_symbolic_shape(self._get_shape_rank(node, 0), node) + + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + + def _infer_Scan(self, node): + subgraph = get_attribute(node, "body") + num_scan_inputs = get_attribute(node, "num_scan_inputs") + scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs) + num_scan_states = len(node.input) - num_scan_inputs + scan_input_axes = [ + handle_negative_axis(ax, self._get_shape_rank(node, i + num_scan_states)) + for i, ax in enumerate(scan_input_axes) + ] + # We may have cases where the subgraph has optional inputs that appear in both subgraph's input and initializer, + # but not in the node's input. In such cases, the input model might be invalid, but let's skip those optional inputs. + assert len(subgraph.input) >= len(node.input) + subgraph_inputs = subgraph.input[: len(node.input)] + for i, si in enumerate(subgraph_inputs): + subgraph_name = si.name + si.CopyFrom(self.known_vi_[node.input[i]]) + if i >= num_scan_states: + scan_input_dim = si.type.tensor_type.shape.dim + scan_input_dim.remove(scan_input_dim[scan_input_axes[i - num_scan_states]]) + si.name = subgraph_name + self._onnx_infer_subgraph(node, subgraph) + num_scan_outputs = len(node.output) - num_scan_states + scan_output_axes = get_attribute(node, "scan_output_axes", [0] * num_scan_outputs) + scan_input_dim = get_shape_from_type_proto(self.known_vi_[node.input[-1]].type)[scan_input_axes[-1]] + for i, o in enumerate(node.output): + vi = self.known_vi_[o] + if i >= num_scan_states: + shape = get_shape_from_type_proto(subgraph.output[i].type) + new_dim = handle_negative_axis(scan_output_axes[i - num_scan_states], len(shape) + 1) + shape = shape[:new_dim] + [scan_input_dim] + shape[new_dim:] + vi.CopyFrom(helper.make_tensor_value_info(o, subgraph.output[i].type.tensor_type.elem_type, shape)) + else: + vi.CopyFrom(subgraph.output[i]) + vi.name = o + + def _infer_ScatterElements(self, node): + data_shape = self._get_shape(node, 0) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + data_shape, + ) + ) + + def _infer_SequenceAt(self, node): + # need to create new symbolic dimension if sequence shape has None: + seq_shape = self._get_shape(node, 0) + vi = self.known_vi_[node.output[0]] + if seq_shape is not None: + for di, d in enumerate(seq_shape): + if d is not None: + continue + new_dim = onnx.TensorShapeProto.Dimension() + new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di)) + vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim) + + def _infer_SequenceInsert(self, node): + # workaround bug in onnx's shape inference + vi_seq = self.known_vi_[node.input[0]] + vi_tensor = self.known_vi_[node.input[1]] + vi_out_seq = self.known_vi_[node.output[0]] + vi_out_seq.CopyFrom(vi_seq) + vi_out_seq.name = node.output[0] + self._fuse_tensor_type(node, 0, vi_out_seq.type, vi_tensor.type) + + def _infer_Shape(self, node): + self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0) + 
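+        # note: recording the result in sympy_data_ (rather than only in value_info) lets later
+        # shape-manipulating ops in this file (e.g. Gather/Slice/Concat/Reshape applied to this
+        # shape tensor) be folded symbolically instead of introducing new unknown dimensions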
+ def _infer_Size(self, node): + sympy_shape = self._get_sympy_shape(node, 0) + self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape) + self.known_vi_[node.output[0]].CopyFrom( + helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) + ) + + def _infer_Slice(self, node): + def less_equal(x, y): + try: + return bool(x <= y) + except TypeError: + pass + try: + return bool(y >= x) + except TypeError: + pass + try: + return bool(-x >= -y) + except TypeError: + pass + try: + return bool(-y <= -x) + except TypeError: + # the last attempt; this may raise TypeError + return bool(y - x >= 0) + + def handle_negative_index(index, bound): + """normalizes a negative index to be in [0, bound)""" + try: + if not less_equal(0, index): + if is_literal(index) and index <= -self.int_max_: + # this case is handled separately + return index + return bound + index + except TypeError: + logger.warning("Cannot determine if {} < 0".format(index)) + return index + + if get_opset(self.out_mp_) <= 9: + axes = get_attribute(node, "axes") + starts = get_attribute(node, "starts") + ends = get_attribute(node, "ends") + if not axes: + axes = list(range(len(starts))) + steps = [1] * len(axes) + else: + starts = as_list(self._try_get_value(node, 1), keep_none=True) + ends = as_list(self._try_get_value(node, 2), keep_none=True) + axes = self._try_get_value(node, 3) + steps = self._try_get_value(node, 4) + if axes is None and not (starts is None and ends is None): + axes = list(range(0, len(starts if starts is not None else ends))) + if steps is None and not (starts is None and ends is None): + steps = [1] * len(starts if starts is not None else ends) + axes = as_list(axes, keep_none=True) + steps = as_list(steps, keep_none=True) + + new_sympy_shape = self._get_sympy_shape(node, 0) + if starts is None or ends is None: + if axes is None: + for i in range(len(new_sympy_shape)): + new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) + else: + new_sympy_shape = get_shape_from_sympy_shape(new_sympy_shape) + for i in axes: + new_sympy_shape[i] = self._new_symbolic_dim_from_output(node, 0, i) + else: + for i, s, e, t in zip(axes, starts, ends, steps): + e = handle_negative_index(e, new_sympy_shape[i]) + if is_literal(e): + if e >= self.int_max_: + e = new_sympy_shape[i] + elif e <= -self.int_max_: + e = 0 if s > 0 else -1 + elif is_literal(new_sympy_shape[i]): + if e < 0: + e = max(0, e + new_sympy_shape[i]) + e = min(e, new_sympy_shape[i]) + else: + if e > 0: + e = ( + sympy.Min(e, new_sympy_shape[i]) if e > 1 else e + ) # special case for slicing first to make computation easier + else: + if is_literal(new_sympy_shape[i]): + e = sympy.Min(e, new_sympy_shape[i]) + else: + try: + if not less_equal(e, new_sympy_shape[i]): + e = new_sympy_shape[i] + except Exception: + logger.warning( + "Unable to determine if {} <= {}, treat as equal".format(e, new_sympy_shape[i]) + ) + e = new_sympy_shape[i] + + s = handle_negative_index(s, new_sympy_shape[i]) + if is_literal(new_sympy_shape[i]) and is_literal(s): + s = max(0, min(s, new_sympy_shape[i])) + + new_sympy_shape[i] = sympy.simplify((e - s + t + (-1 if t > 0 else 1)) // t) + + self._update_computed_dims(new_sympy_shape) + + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + + # handle sympy_data if needed, for slice in shape computation + if ( + node.input[0] in self.sympy_data_ + and [0] == axes + and 
len(starts) == 1 + and len(ends) == 1 + and len(steps) == 1 + ): + input_sympy_data = self.sympy_data_[node.input[0]] + if type(input_sympy_data) == list or ( + type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1 + ): + self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]] + + def _infer_SoftmaxCrossEntropyLoss(self, node): + vi = self.known_vi_[node.output[0]] + elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi.type.tensor_type.elem_type = elem_type + vi.type.tensor_type.shape.CopyFrom(onnx.TensorShapeProto()) + + if len(node.output) > 1: + data_shape = self._get_shape(node, 0) + vi = self.known_vi_[node.output[1]] + vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape)) + + def _infer_Split_Common(self, node, make_value_info_func): + input_sympy_shape = self._get_sympy_shape(node, 0) + axis = handle_negative_axis(get_attribute(node, "axis", 0), len(input_sympy_shape)) + split = get_attribute(node, "split") + if not split: + num_outputs = len(node.output) + split = [input_sympy_shape[axis] / sympy.Integer(num_outputs)] * num_outputs + self._update_computed_dims(split) + else: + split = [sympy.Integer(s) for s in split] + + for i_o in range(len(split)): + vi = self.known_vi_[node.output[i_o]] + vi.CopyFrom( + make_value_info_func( + node.output[i_o], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(input_sympy_shape[:axis] + [split[i_o]] + input_sympy_shape[axis + 1 :]), + ) + ) + self.known_vi_[vi.name] = vi + + def _infer_Split(self, node): + self._infer_Split_Common(node, helper.make_tensor_value_info) + + def _infer_SplitToSequence(self, node): + self._infer_Split_Common(node, helper.make_sequence_value_info) + + def _infer_Squeeze(self, node): + input_shape = self._get_shape(node, 0) + op_set = get_opset(self.out_mp_) + + # Depending on op-version 'axes' are provided as attribute or via 2nd input + if op_set < 13: + axes = get_attribute(node, "axes") + assert self._try_get_value(node, 1) is None + else: + axes = self._try_get_value(node, 1) + assert get_attribute(node, "axes") is None + + if axes is None: + # No axes have been provided (neither via attribute nor via input). + # In this case the 'Shape' op should remove all axis with dimension 1. + # For symbolic dimensions we guess they are !=1. + output_shape = [s for s in input_shape if s != 1] + if self.verbose_ > 0: + symbolic_dimensions = [s for s in input_shape if type(s) != int] + if len(symbolic_dimensions) > 0: + logger.debug( + f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " + + f"Assuming the following dimensions are never equal to 1: {symbolic_dimensions}" + ) + else: + axes = [handle_negative_axis(a, len(input_shape)) for a in axes] + output_shape = [] + for i in range(len(input_shape)): + if i not in axes: + output_shape.append(input_shape[i]) + else: + assert input_shape[i] == 1 or type(input_shape[i]) != int + if self.verbose_ > 0 and type(input_shape[i]) != int: + logger.debug( + f"Symbolic dimensions in input shape of op: '{node.op_type}' node: '{node.name}'. " + + f"Assuming the dimension '{input_shape[i]}' at index {i} of the input to be equal to 1." 
+ ) + + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + output_shape, + ) + ) + self._pass_on_sympy_data(node) + + def _infer_Tile(self, node): + repeats_value = self._try_get_value(node, 1) + new_sympy_shape = [] + if repeats_value is not None: + input_sympy_shape = self._get_sympy_shape(node, 0) + for i, d in enumerate(input_sympy_shape): + new_dim = d * repeats_value[i] + new_sympy_shape.append(new_dim) + self._update_computed_dims(new_sympy_shape) + else: + new_sympy_shape = self._new_symbolic_shape(self._get_shape_rank(node, 0), node) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + vi.type.tensor_type.elem_type, + get_shape_from_sympy_shape(new_sympy_shape), + ) + ) + + def _infer_TopK(self, node): + rank = self._get_shape_rank(node, 0) + axis = handle_negative_axis(get_attribute(node, "axis", -1), rank) + new_shape = self._get_shape(node, 0) + + if get_opset(self.out_mp_) <= 9: + k = get_attribute(node, "k") + else: + k = self._get_int_values(node)[1] + + if k == None: + k = self._new_symbolic_dim_from_output(node) + else: + k = as_scalar(k) + + if type(k) in [int, str]: + new_shape[axis] = k + else: + new_sympy_shape = self._get_sympy_shape(node, 0) + new_sympy_shape[axis] = k + self._update_computed_dims( + new_sympy_shape + ) # note that TopK dim could be computed in sympy_data, so need to update computed_dims when it enters shape + new_shape = get_shape_from_sympy_shape(new_sympy_shape) + + for i_o in range(len(node.output)): + vi = self.known_vi_[node.output[i_o]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[i_o], vi.type.tensor_type.elem_type, new_shape)) + + def _infer_Transpose(self, node): + if node.input[0] in self.sympy_data_: + data_shape = self._get_shape(node, 0) + perm = get_attribute(node, "perm", reversed(list(range(len(data_shape))))) + input_data = self.sympy_data_[node.input[0]] + self.sympy_data_[node.output[0]] = ( + np.transpose(np.array(input_data).reshape(*data_shape), axes=tuple(perm)).flatten().tolist() + ) + + def _infer_Unsqueeze(self, node): + input_shape = self._get_shape(node, 0) + op_set = get_opset(self.out_mp_) + + # Depending on op-version 'axes' are provided as attribute or via 2nd input + if op_set < 13: + axes = get_attribute(node, "axes") + assert self._try_get_value(node, 1) is None + else: + axes = self._try_get_value(node, 1) + assert get_attribute(node, "axes") is None + + output_rank = len(input_shape) + len(axes) + axes = [handle_negative_axis(a, output_rank) for a in axes] + + input_axis = 0 + output_shape = [] + for i in range(output_rank): + if i in axes: + output_shape.append(1) + else: + output_shape.append(input_shape[input_axis]) + input_axis += 1 + + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + output_shape, + ) + ) + + self._pass_on_sympy_data(node) + + def _infer_ZipMap(self, node): + map_key_type = None + if get_attribute(node, "classlabels_int64s") is not None: + map_key_type = onnx.TensorProto.INT64 + elif get_attribute(node, "classlabels_strings") is not None: + map_key_type = onnx.TensorProto.STRING + + assert map_key_type is not None + new_vi = onnx.ValueInfoProto() + new_vi.name = node.output[0] + new_vi.type.sequence_type.elem_type.map_type.value_type.tensor_type.elem_type = onnx.TensorProto.FLOAT + 
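+        # note: per the ONNX spec, ZipMap produces a sequence of maps with float values; the
+        # key type (INT64 or STRING) follows whichever classlabels attribute is present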
new_vi.type.sequence_type.elem_type.map_type.key_type = map_key_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(new_vi) + + def _infer_Attention(self, node): + shape = self._get_shape(node, 0) + shape_bias = self._get_shape(node, 2) + assert len(shape) == 3 and len(shape_bias) == 1 + qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes") + if qkv_hidden_sizes_attr is not None: + assert len(qkv_hidden_sizes_attr) == 3 + shape[2] = int(qkv_hidden_sizes_attr[2]) + else: + shape[2] = int(shape_bias[0] / 3) + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape)) + + if len(node.output) > 1: + # input shape: (batch_size, sequence_length, hidden_size) + # past shape: (2, batch_size, num_heads, past_sequence_length, head_size) + # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len) + # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length + input_shape = self._get_shape(node, 0) + past_shape = self._get_shape(node, 4) + mask_shape = self._get_shape(node, 3) + if len(past_shape) == 5: + if len(mask_shape) in [2, 3]: + past_shape[3] = mask_shape[-1] + elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int): + past_shape[3] = input_shape[1] + past_shape[3] + else: + past_shape[3] = f"{past_shape[3]}+{input_shape[1]}" + vi = self.known_vi_[node.output[1]] + vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, past_shape)) + + def _infer_BiasGelu(self, node): + self._propagate_shape_and_type(node) + + def _infer_FastGelu(self, node): + self._propagate_shape_and_type(node) + + def _infer_Gelu(self, node): + self._propagate_shape_and_type(node) + + def _infer_LayerNormalization(self, node): + self._propagate_shape_and_type(node) + + def _infer_LongformerAttention(self, node): + self._propagate_shape_and_type(node) + + def _infer_EmbedLayerNormalization(self, node): + input_ids_shape = self._get_shape(node, 0) + word_embedding_shape = self._get_shape(node, 2) + assert len(input_ids_shape) == 2 and len(word_embedding_shape) == 2 + output_shape = input_ids_shape + [word_embedding_shape[1]] + + word_embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], word_embedding_dtype, output_shape)) + + mask_index_shape = [input_ids_shape[0]] + vi = self.known_vi_[node.output[1]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[1], onnx.TensorProto.INT32, mask_index_shape)) + + if len(node.output) > 2: + # Optional output of add before layer nomalization is done + # shape is same as the output + vi = self.known_vi_[node.output[2]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[2], word_embedding_dtype, output_shape)) + + def _infer_SkipLayerNormalization(self, node): + self._propagate_shape_and_type(node) + + def _infer_PythonOp(self, node): + output_tensor_types = get_attribute(node, "output_tensor_types") + assert output_tensor_types + output_tensor_ranks = get_attribute(node, "output_tensor_ranks") + assert output_tensor_ranks + + # set the context output seperately. + # The first output is autograd's context. 
+ vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [])) + + # Outputs after autograd's context are tensors. + # We assume their ranks are fixed for different model inputs. + for i in range(len(node.output) - 1): + # Process the i-th tensor outputs. + vi = self.known_vi_[node.output[i + 1]] + sympy_shape = self._new_symbolic_shape(output_tensor_ranks[i], node) + shape = get_shape_from_sympy_shape(sympy_shape) + value_info = helper.make_tensor_value_info(node.output[i + 1], output_tensor_types[i], shape) + vi.CopyFrom(value_info) + + def _propagate_shape_and_type(self, node, input_index=0, output_index=0): + shape = self._get_shape(node, input_index) + output_dtype = self.known_vi_[node.input[input_index]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[output_index]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[output_index], output_dtype, shape)) + + def _is_none_dim(self, dim_value): + if type(dim_value) != str: + return False + if "unk__" not in dim_value: + return False + if dim_value in self.symbolic_dims_.keys(): + return False + return True + + def _is_shape_contains_none_dim(self, out_shape): + for out in out_shape: + if self._is_none_dim(out): + return out + return None + + def _infer_impl(self, start_sympy_data=None): + self.sympy_data_ = start_sympy_data or {} + self.out_mp_.graph.ClearField("value_info") + self._apply_suggested_merge(graph_input_only=True) + self.input_symbols_ = set() + for i in self.out_mp_.graph.input: + input_shape = get_shape_from_value_info(i) + if input_shape is None: + continue + + if is_sequence(i.type): + input_dims = i.type.sequence_type.elem_type.tensor_type.shape.dim + else: + input_dims = i.type.tensor_type.shape.dim + + for i_dim, dim in enumerate(input_shape): + if dim is None: + # some models use None for symbolic dim in input, replace it with a string + input_dims[i_dim].dim_param = str(self._new_symbolic_dim(i.name, i_dim)) + + self.input_symbols_.update([d for d in input_shape if type(d) == str]) + + for s in self.input_symbols_: + if s in self.suggested_merge_: + s_merge = self.suggested_merge_[s] + assert s_merge in self.symbolic_dims_ + self.symbolic_dims_[s] = self.symbolic_dims_[s_merge] + else: + # Since inputs are not produced by other ops, we can assume positivity + self.symbolic_dims_[s] = sympy.Symbol(s, integer=True, positive=True) + # create a temporary ModelProto for single node inference + # note that we remove initializer to have faster inference + # for tensor ops like Reshape/Tile/Expand that read initializer, we need to do sympy computation based inference anyways + self.tmp_mp_ = onnx.ModelProto() + self.tmp_mp_.CopyFrom(self.out_mp_) + self.tmp_mp_.graph.ClearField("initializer") + + # compute prerequesite for node for topological sort + # node with subgraphs may have dependency on implicit inputs, which will affect topological sort + prereq_for_node = {} # map from node to all its inputs, including implicit ones in subgraph + + def get_prereq(node): + names = set(i for i in node.input if i) + subgraphs = [] + if "If" == node.op_type: + subgraphs = [ + get_attribute(node, "then_branch"), + get_attribute(node, "else_branch"), + ] + elif node.op_type in ["Loop", "Scan"]: + subgraphs = [get_attribute(node, "body")] + for g in subgraphs: + g_outputs_and_initializers = {i.name for i in g.initializer} + g_prereq = set() + for n in g.node: + g_outputs_and_initializers.update(n.output) + for n in g.node: + g_prereq.update([i for i in 
get_prereq(n) if i not in g_outputs_and_initializers]) + names.update(g_prereq) + # remove subgraph inputs from g_prereq since those are local-only + for i in g.input: + if i.name in names: + names.remove(i.name) + return names + + for n in self.tmp_mp_.graph.node: + prereq_for_node[n.output[0]] = get_prereq(n) + + # topological sort nodes, note there might be dead nodes so we check if all graph outputs are reached to terminate + sorted_nodes = [] + sorted_known_vi = set([i.name for i in list(self.out_mp_.graph.input) + list(self.out_mp_.graph.initializer)]) + if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]): + # Loop/Scan will have some graph output in graph inputs, so don't do topological sort + sorted_nodes = self.out_mp_.graph.node + else: + while not all([o.name in sorted_known_vi for o in self.out_mp_.graph.output]): + old_sorted_nodes_len = len(sorted_nodes) + for node in self.out_mp_.graph.node: + if (node.output[0] not in sorted_known_vi) and all( + [i in sorted_known_vi for i in prereq_for_node[node.output[0]] if i] + ): + sorted_known_vi.update(node.output) + sorted_nodes.append(node) + if old_sorted_nodes_len == len(sorted_nodes) and not all( + [o.name in sorted_known_vi for o in self.out_mp_.graph.output] + ): + raise Exception("Invalid model with cyclic graph") + + for node in sorted_nodes: + assert all([i in self.known_vi_ for i in node.input if i]) + self._onnx_infer_single_node(node) + known_aten_op = False + if node.op_type in self.dispatcher_: + self.dispatcher_[node.op_type](node) + elif node.op_type in ["ConvTranspose"]: + # onnx shape inference ops like ConvTranspose may have empty shape for symbolic input + # before adding symbolic compute for them + # mark the output type as UNDEFINED to allow guessing of rank + vi = self.known_vi_[node.output[0]] + if len(vi.type.tensor_type.shape.dim) == 0: + vi.type.tensor_type.elem_type = onnx.TensorProto.UNDEFINED + elif node.op_type == "ATen" and node.domain == "org.pytorch.aten": + for attr in node.attribute: + # TODO: Is overload_name needed? + if attr.name == "operator": + aten_op_name = attr.s.decode("utf-8") if isinstance(attr.s, bytes) else attr.s + if aten_op_name in self.aten_op_dispatcher_: + known_aten_op = True + self.aten_op_dispatcher_[aten_op_name](node) + break + + if self.verbose_ > 2: + logger.debug(node.op_type + ": " + node.name) + for i, name in enumerate(node.input): + logger.debug( + " Input {}: {} {}".format(i, name, "initializer" if name in self.initializers_ else "") + ) + + # onnx automatically merge dims with value, i.e. 
Mul(['aaa', 'bbb'], [1000, 1]) -> [1000, 'bbb'] + # symbolic shape inference needs to apply merge of 'aaa' -> 1000 in this case + if node.op_type in [ + "Add", + "Sub", + "Mul", + "Div", + "MatMul", + "MatMulInteger", + "MatMulInteger16", + "Where", + "Sum", + ]: + vi = self.known_vi_[node.output[0]] + out_rank = len(get_shape_from_type_proto(vi.type)) + in_shapes = [self._get_shape(node, i) for i in range(len(node.input))] + for d in range(out_rank - (2 if node.op_type in ["MatMul", "MatMulInteger", "MatMulInteger16"] else 0)): + in_dims = [s[len(s) - out_rank + d] for s in in_shapes if len(s) + d >= out_rank] + if len(in_dims) > 1: + self._check_merged_dims(in_dims, allow_broadcast=True) + + for i_o in range(len(node.output)): + vi = self.known_vi_[node.output[i_o]] + out_type = vi.type + out_type_kind = out_type.WhichOneof("value") + + # do not process shape for non-tensors + if out_type_kind not in ["tensor_type", "sparse_tensor_type", None]: + if self.verbose_ > 2: + if out_type_kind == "sequence_type": + seq_cls_type = out_type.sequence_type.elem_type.WhichOneof("value") + if "tensor_type" == seq_cls_type: + logger.debug( + " {}: sequence of {} {}".format( + node.output[i_o], + str(get_shape_from_value_info(vi)), + onnx.TensorProto.DataType.Name( + vi.type.sequence_type.elem_type.tensor_type.elem_type + ), + ) + ) + else: + logger.debug(" {}: sequence of {}".format(node.output[i_o], seq_cls_type)) + else: + logger.debug(" {}: {}".format(node.output[i_o], out_type_kind)) + continue + + out_shape = get_shape_from_value_info(vi) + out_type_undefined = out_type.tensor_type.elem_type == onnx.TensorProto.UNDEFINED + if self.verbose_ > 2: + logger.debug( + " {}: {} {}".format( + node.output[i_o], + str(out_shape), + onnx.TensorProto.DataType.Name(vi.type.tensor_type.elem_type), + ) + ) + if node.output[i_o] in self.sympy_data_: + logger.debug(" Sympy Data: " + str(self.sympy_data_[node.output[i_o]])) + + # onnx >= 1.11.0, use unk__#index instead of None when the shape dim is uncertain + if ( + out_shape is not None and (None in out_shape or self._is_shape_contains_none_dim(out_shape)) + ) or out_type_undefined: + if self.auto_merge_: + if node.op_type in [ + "Add", + "Sub", + "Mul", + "Div", + "MatMul", + "MatMulInteger", + "MatMulInteger16", + "Concat", + "Where", + "Sum", + "Equal", + "Less", + "Greater", + "LessOrEqual", + "GreaterOrEqual", + "Min", + "Max", + ]: + shapes = [self._get_shape(node, i) for i in range(len(node.input))] + if node.op_type in [ + "MatMul", + "MatMulInteger", + "MatMulInteger16", + ]: + if None in out_shape or self._is_shape_contains_none_dim(out_shape): + if None in out_shape: + idx = out_shape.index(None) + else: + idx = out_shape.index(self._is_shape_contains_none_dim(out_shape)) + dim_idx = [len(s) - len(out_shape) + idx for s in shapes] + # only support auto merge for MatMul for dim < rank-2 when rank > 2 + assert len(shapes[0]) > 2 and dim_idx[0] < len(shapes[0]) - 2 + assert len(shapes[1]) > 2 and dim_idx[1] < len(shapes[1]) - 2 + elif node.op_type == "Expand": + # auto merge for cases like Expand([min(batch, 1), min(seq, 512)], [batch, seq]) + shapes = [ + self._get_shape(node, 0), + self._get_value(node, 1), + ] + else: + shapes = [] + + if shapes: + for idx in range(len(out_shape)): + if out_shape[idx] is not None and not self._is_none_dim(out_shape[idx]): + continue + # note that the broadcasting rule aligns from right to left + # if a tensor has a lower rank (dim_idx[idx] < 0), it would automatically broadcast and need no merge + dim_idx = [len(s) - 
len(out_shape) + idx for s in shapes] + if len(dim_idx) > 0: + self._add_suggested_merge( + [ + s[i] if is_literal(s[i]) else str(s[i]) + for s, i in zip(shapes, dim_idx) + if i >= 0 + ] + ) + self.run_ = True + else: + self.run_ = False + else: + self.run_ = False + + # create new dynamic dims for ops not handled by symbolic shape inference + if self.run_ == False and not node.op_type in self.dispatcher_ and not known_aten_op: + is_unknown_op = out_type_undefined and (out_shape is None or len(out_shape) == 0) + if is_unknown_op: + # unknown op to ONNX, maybe from higher opset or other domain + # only guess the output rank from input 0 when using guess_output_rank option + out_rank = self._get_shape_rank(node, 0) if self.guess_output_rank_ else -1 + else: + # valid ONNX op, but not handled by symbolic shape inference, just assign dynamic shape + out_rank = len(out_shape) + + if out_rank >= 0: + new_shape = self._new_symbolic_shape(out_rank, node, i_o) + if out_type_undefined: + # guess output data type from input vi if not defined + out_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + else: + # otherwise, use original data type + out_dtype = vi.type.tensor_type.elem_type + vi.CopyFrom( + helper.make_tensor_value_info( + vi.name, + out_dtype, + get_shape_from_sympy_shape(new_shape), + ) + ) + + if self.verbose_ > 0: + if is_unknown_op: + logger.debug( + "Possible unknown op: {} node: {}, guessing {} shape".format( + node.op_type, node.name, vi.name + ) + ) + if self.verbose_ > 2: + logger.debug( + " {}: {} {}".format( + node.output[i_o], + str(new_shape), + vi.type.tensor_type.elem_type, + ) + ) + + self.run_ = True + continue # continue the inference after guess, no need to stop as no merge is needed + + if self.verbose_ > 0 or not self.auto_merge_ or out_type_undefined: + logger.debug("Stopping at incomplete shape inference at " + node.op_type + ": " + node.name) + logger.debug("node inputs:") + for i in node.input: + logger.debug(self.known_vi_[i]) + logger.debug("node outputs:") + for o in node.output: + logger.debug(self.known_vi_[o]) + if self.auto_merge_ and not out_type_undefined: + logger.debug("Merging: " + str(self.suggested_merge_)) + return False + + self.run_ = False + return True + + def _update_output_from_vi(self): + for output in self.out_mp_.graph.output: + if output.name in self.known_vi_: + output.CopyFrom(self.known_vi_[output.name]) + + @staticmethod + def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=False, verbose=0): + onnx_opset = get_opset(in_mp) + if (not onnx_opset) or onnx_opset < 7: + logger.warning("Only support models of onnx opset 7 and above.") + return None + symbolic_shape_inference = SymbolicShapeInference(int_max, auto_merge, guess_output_rank, verbose) + all_shapes_inferred = False + symbolic_shape_inference._preprocess(in_mp) + while symbolic_shape_inference.run_: + all_shapes_inferred = symbolic_shape_inference._infer_impl() + symbolic_shape_inference._update_output_from_vi() + if not all_shapes_inferred: + raise Exception("Incomplete symbolic shape inference") + return symbolic_shape_inference.out_mp_ + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="The input model file") + parser.add_argument("--output", help="The output model file") + parser.add_argument( + "--auto_merge", + help="Automatically merge symbolic dims when confliction happens", + action="store_true", + default=False, + ) + parser.add_argument( + "--int_max", + 
help="maximum value for integer to be treated as boundless for ops like slice", + type=int, + default=2**31 - 1, + ) + parser.add_argument( + "--guess_output_rank", + help="guess output rank to be the same as input 0 for unknown ops", + action="store_true", + default=False, + ) + parser.add_argument( + "--verbose", + help="Prints detailed logs of inference, 0: turn off, 1: warnings, 3: detailed", + type=int, + default=0, + ) + parser.add_argument( + "--save_as_external_data", + help="Saving an ONNX model to external data", + action="store_true", + default=False, + ) + parser.add_argument( + "--all_tensors_to_one_file", + help="Saving all the external data to one file", + action="store_true", + default=False, + ) + parser.add_argument( + "--external_data_location", + help="The file location to save the external file", + default="./", + ) + parser.add_argument( + "--external_data_size_threshold", + help="The size threshold for external data", + type=int, + default=1024, + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_arguments() + logger.info("input model: " + args.input) + if args.output: + logger.info("output model " + args.output) + logger.info("Doing symbolic shape inference...") + out_mp = SymbolicShapeInference.infer_shapes( + onnx.load(args.input), + args.int_max, + args.auto_merge, + args.guess_output_rank, + args.verbose, + ) + if args.output and out_mp: + if args.save_as_external_data: + onnx.save_model( + out_mp, + args.output, + save_as_external_data=True, + all_tensors_to_one_file=args.all_tensors_to_one_file, + location=args.external_data_location, + size_threshold=args.external_data_size_threshold, + convert_attribute=False, + ) + else: + onnx.save(out_mp, args.output) + logger.info("Done!") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4228878179aa0c3fa63fd9656087de5a90d5e31c --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/optimizer/requirements.txt @@ -0,0 +1,3 @@ +sympy +packaging +onnxsim \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..396998600124e124f2fc8f78483d68a27ca7e4ed --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt @@ -0,0 +1,19 @@ +onnx==1.15.0 +onnxsim==0.4.36 +tf2onnx==1.16.1 +onnxruntime +onnxoptimizer==0.3.13 +bert-tensorflow==1.0.1 + +pandas==2.1.1 +numpy==1.23.0 +matplotlib +scikit-learn +opencv-python==4.6.0.66 +opencv-python-headless +tokenization==1.0.7 +tokenizers==0.13.3 +sentencepiece==0.1.96 +typing_extensions==4.10.0 + +py-libnuma==1.2 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ddf008338b3ed3b9297dbf81b8aac344179d77 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/runtime_backend_iluvatar.py @@ -0,0 +1,658 @@ +# Copyright 2023 Graphcore Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import torch +import logging +import numpy as np +from tqdm import tqdm +import threading +import importlib + +from general_perf.backends import runtime_backend +from general_perf.backends.ILUVATAR.common import init_by_tensorrt, setup_io_bindings +from general_perf.backends.ILUVATAR.common import Task, TaskThread +from cuda import cuda, cudart +import numa + +from general_perf.backends.ILUVATAR.common import load_ixrt_plugin + +log = logging.getLogger("RuntimeBackendILUVATAR") + +Dims = None + +pt_dtype_map = { + "FLOAT32": torch.float32, + "FLOAT16": torch.float16, + "INT8": torch.int8, + "INT32":torch.int32, + "LONG": torch.long, + "INT64": torch.int64, + "BOOL": torch.bool +} + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64, + "BOOL": np.bool +} + +class RuntimeBackendILUVATAR(runtime_backend.RuntimeBackend): + def __init__(self): + super(RuntimeBackendILUVATAR, self).__init__() + self.hardware_type = "ILUVATAR" + self.need_reload = False + self.model_runtimes = [] + self.configs = None + self.engine = None + self.context = None + self.batch_size = -1 + self.workload = None + self.predict_fps = None + self.predict_time = None + self.task = None + self.inputs = None + self.outputs = None + self.allocations = None + numa.memory.set_local_alloc() + numa.schedule.run_on_nodes(0) + + def isSDmodel(self, model_name): + result = False + if model_name == 'vae-decoder-onnx-fp32' or model_name == 'vae-encoder-onnx-fp32' or model_name == 'clip-onnx-fp32': + result = True + return result + + # Dual-core inference of Tian SoC BI-150 graphics card + def benchmark(self, dataloader): + performance_reports = [] + merged_dict = {} + model_name = self.configs["model"].split("-")[0] + + workers = [] + lock = threading.Lock() + for i in range(2): + device_id = i + self.task = Task(self.batch_size, dataloader, device_id, self.load, self.benchmark_interact, performance_reports, lock, framework=model_name) + + work = TaskThread(self.task.run, []) + workers.append(work) + work.start() + work.join() + + if model_name != 'gpt2': + if not self.isSDmodel(self.configs["model"]): + del self.engine + del self.context + + if len(performance_reports[0]) == len(performance_reports[1]): + if performance_reports[0].keys() == performance_reports[1].keys(): + + qps = performance_reports[0]['QPS'] + performance_reports[1]['QPS'] + avg_latency = round(((performance_reports[0]['AVG Latency'] + performance_reports[1]['AVG Latency']) / 2.0), 2) + p99_latency = round(((performance_reports[0]['P99 Latency'] + performance_reports[1]['P99 Latency']) / 2.0), 2) + + merged_dict['BS'] = performance_reports[0]['BS'] + merged_dict['QPS'] = qps + merged_dict['AVG Latency'] = avg_latency + merged_dict["P99 Latency"] = p99_latency + + if model_name != 'gpt2': + predict_qps = performance_reports[0]['predict QPS'] + performance_reports[1]['predict QPS'] + predict_avg_latency = 
round(((performance_reports[0]['predict AVG Latency'] + performance_reports[1]['predict AVG Latency']) / 2.0), 2) + predict_p99_latency = round(((performance_reports[0]['predict P99 Latency'] + performance_reports[1]['predict P99 Latency']) / 2.0), 2) + + merged_dict['predict QPS'] = predict_qps + merged_dict['predict AVG Latency'] = predict_avg_latency + merged_dict["predict P99 Latency"] = predict_p99_latency + + return merged_dict + + def init_allocs(self): + if self.inputs is not None: + for i in range(len(self.inputs)): + err, = cudart.cudaFree(self.inputs[i]["allocation"]) + assert err == cudart.cudaError_t.cudaSuccess + + for i in range(len(self.outputs)): + err, = cudart.cudaFree(self.outputs[i]["allocation"]) + assert err == cudart.cudaError_t.cudaSuccess + self.inputs = None + + def get_allocs(self): + if self.inputs is None: + self.inputs, self.outputs, self.allocations = setup_io_bindings(self.engine, self.context) + return self.inputs, self.outputs, self.allocations + + def predict_dump(self, feeds): + input_tensors = [] + i = 0 + + model_name = self.configs["model"].split("-")[0] + + if model_name != 'gpt2': + if model_name == 'deberta': + keys = list(feeds.keys()) + input_ids = feeds[keys[0]] + attention_mask = feeds[keys[1]] + input_tensors = [input_ids, attention_mask] + + else: + for key, _ in feeds.items(): + input_tensors.append(feeds[key]) + i += 1 + + # ixrt inference + engine = self.engine + assert engine + context = self.context + assert context + + # set dynamic shape + input_tensor_map = self.configs["segments"][0]["input_tensor_map"] + input_shape = input_tensor_map.values() + + i = 0 + for input_name, _ in input_tensor_map.items(): + if model_name == 'widedeep': + input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32)) + input_names = [ + "new_categorical_placeholder:0", + "new_numeric_placeholder:0", + "import/head/predictions/zeros_like:0" + ] + for input_name in input_names: + if input_name == 'new_categorical_placeholder:0': + input_shape = input_tensors[0].shape + if input_name == 'new_numeric_placeholder:0': + input_shape = input_tensors[1].shape + if input_name == 'import/head/predictions/zeros_like:0': + input_shape = input_tensors[2].shape + + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(input_shape)) + + elif model_name == 'deberta': + input_names = [ + "input_ids.1", + "attention_mask.1", + ] + for input_name in input_names: + if input_name == 'input_ids.1': + input_shape = input_tensors[0].shape + if input_name == 'attention_mask.1': + input_shape = input_tensors[1].shape + + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(input_shape)) + + else: + input_shape = input_tensors[i].shape + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(input_shape)) + i += 1 + + # Setup I/O bindings + inputs, outputs, allocations = self.get_allocs() + + # Prepare the output data + outputs_list = [] + for i in range(len(outputs)): + output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"]) + outputs_list.append(output) + + data_batch_list = [] + for i in range(len(input_tensors)): + data_batch = np.ascontiguousarray(input_tensors[i]) + data_batch_list.append(data_batch) + + return input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list + + def predict_timing(self, input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list): + model_name = 
self.configs["model"].split("-")[0] + + # H2D: host to device + for i in range(len(inputs)): + (err, ) = cudart.cudaHostRegister(data_batch_list[i], inputs[i]["nbytes"], 2) + + for i in range(len(inputs)): + (err, ) = cudart.cudaMemcpy( + inputs[i]["allocation"], + data_batch_list[i], + inputs[i]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + ) + + for i in range(len(inputs)): + (err, ) = cudart.cudaHostUnregister(data_batch_list[i]) + + starttime = time.time() + context.execute_v2(allocations) + endtime = time.time() + + self.predict_time = endtime - starttime + + # D2H: device to host + for i in range(len(outputs)): + (err, )= cudart.cudaMemcpy(outputs_list[i], + outputs[i]["allocation"], + outputs[i]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + ) + + result = {} + + output_tensor_map = self.configs["segments"][0]["output_tensor_map"] + output_name = output_tensor_map.split(",") + + for i in range(len(output_name)): + if model_name == 'yolov5': + result[output_name[0]] = outputs_list[0] + break + + result[output_name[i]] = outputs_list[i] + + if model_name == 'videobert': + return outputs_list + + elif model_name == 'gpt2': + return None + + else: + return result + + def predict(self, feeds): + # The deberta model is currently unable to undergo accuracy testing temporarily + input_tensors = [] + i = 0 + + model_name = self.configs["model"].split("-")[0] + if self.isSDmodel(self.configs["model"]): + for key, _ in feeds.items(): + tmp_tensor = torch.tensor(feeds[key], + dtype=pt_dtype_map[self.input_type[i]]) + input_tensors.append(tmp_tensor) + i += 1 + + self.predict_sd(input_tensors) + return + + elif model_name != 'gpt2': + if model_name == 'deberta': + keys = list(feeds.keys()) + input_ids = np.array(feeds[keys[0]], dtype=INPUT_TYPE[self.input_type[i]]) + attention_mask = np.array(feeds[keys[1]], dtype=INPUT_TYPE[self.input_type[i]]) + input_tensors = [input_ids, attention_mask] + + else: + trans_index = [0, 1, 2] + if model_name == 'bert' and self.configs['compile_precision'] == 'INT8': + trans_index = [0, 2, 1] + + for key, _ in feeds.items(): + tmp_tensor = np.array(feeds[key], dtype=INPUT_TYPE[self.input_type[trans_index[i]]]) + input_tensors.append(tmp_tensor) + i += 1 + + # ixrt inference + engine = self.engine + assert engine + context = self.context + assert context + + # set dynamic shape + input_tensor_map = self.configs["segments"][0]["input_tensor_map"] + input_shape = input_tensor_map.values() + + i = 0 + for input_name, _ in input_tensor_map.items(): + if model_name == 'widedeep': + input_tensors.append(np.zeros((self.batch_size, 1), dtype=np.float32)) + input_names = [ + "new_categorical_placeholder:0", + "new_numeric_placeholder:0", + "import/head/predictions/zeros_like:0" + ] + for input_name in input_names: + if input_name == 'new_categorical_placeholder:0': + input_shape = input_tensors[0].shape + if input_name == 'new_numeric_placeholder:0': + input_shape = input_tensors[1].shape + if input_name == 'import/head/predictions/zeros_like:0': + input_shape = input_tensors[2].shape + + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(input_shape)) + + elif model_name == 'deberta': + input_names = [ + "input_ids.1", + "attention_mask.1", + ] + for input_name in input_names: + if input_name == 'input_ids.1': + input_shape = input_tensors[0].shape + if input_name == 'attention_mask.1': + input_shape = input_tensors[1].shape + + input_idx = engine.get_binding_index(input_name) + 
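+                    # with the binding index resolved by name, pin this input's runtime shape
+                    # on the execution context before the engine runs on this batch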
context.set_binding_shape(input_idx, Dims(input_shape)) + + else: + input_shape = input_tensors[i].shape + input_idx = engine.get_binding_index(input_name) + context.set_binding_shape(input_idx, Dims(input_shape)) + i += 1 + + # Setup I/O bindings + inputs, outputs, allocations = self.get_allocs() + + # Prepare the output data + outputs_list = [] + for i in range(len(outputs)): + output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"]) + outputs_list.append(output) + + data_batch_list = [] + for i in range(len(input_tensors)): + data_batch = np.ascontiguousarray(input_tensors[i]) + data_batch_list.append(data_batch) + + # H2D: host to device + for i in range(len(inputs)): + (err, ) = cudart.cudaHostRegister(data_batch_list[i], inputs[i]["nbytes"], 2) + + for i in range(len(inputs)): + (err, ) = cudart.cudaMemcpy( + inputs[i]["allocation"], + data_batch_list[i], + inputs[i]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + ) + + for i in range(len(inputs)): + (err, ) = cudart.cudaHostUnregister(data_batch_list[i]) + + starttime = time.time() + context.execute_v2(allocations) + endtime = time.time() + + self.predict_time = endtime - starttime + + # D2H: device to host + for i in range(len(outputs)): + (err, )= cudart.cudaMemcpy(outputs_list[i], + outputs[i]["allocation"], + outputs[i]["nbytes"], + cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost + ) + + # Free Gpu Memory + # cuda-python + self.init_allocs() + + result = {} + + output_tensor_map = self.configs["segments"][0]["output_tensor_map"] + output_name = output_tensor_map.split(",") + + for i in range(len(output_name)): + if model_name == 'yolov5': + result[output_name[0]] = outputs_list[0] + break + + result[output_name[i]] = outputs_list[i] + else: + self.predict_igie(feeds) + + if model_name == 'videobert': + return outputs_list + + elif model_name == 'gpt2': + return None + + else: + return result + + def predict_igie(self, dataloader): + tvm = importlib.import_module("tvm") + self.task.module.set_input("input_ids", tvm.nd.array(dataloader["input_ids"].astype('int64'), self.device)) + self.task.module.run() + output = self.task.module.get_output(0) + + return output + + def benchmark_interact(self, dataloader): + batch_size = self.get_loaded_batch_size() + iterations = self.workload['iterations'] + model_name = self.configs["model"].split("-")[0] + times_range = [] + predict_range = [] + report = {} + report["BS"] = batch_size + + if model_name == 'gpt2': + self.load_igie(batch_size) + elif self.isSDmodel(self.configs["model"]): + self.load_sd(batch_size) + + test_data = self._get_fake_samples(batch_size=batch_size, + shape=self.configs['segments'][0]['input_tensor_map'], + input_type=self.configs['input_type']) + + # Free Gpu Memory + # cuda-python + self.init_allocs() + + for _ in range(30): + self.predict(test_data) + + for _ in range(iterations): + if model_name != 'gpt2' and model_name != 'vae' and model_name != 'clip': + input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list = self.predict_dump(test_data) + + start_time = time.time() + self.predict_timing(input_tensors, inputs, outputs, data_batch_list, allocations, context, outputs_list) + end_time = time.time() + + else: + start_time = time.time() + self.predict(test_data) + end_time = time.time() + + times_range.append(end_time - start_time) + predict_range.append(self.predict_time) + + times_range.sort() + tail_latency = round( + times_range[int(len(times_range) * 0.99)] * 1000, 2) + avg_latency = round(sum(times_range) / iterations * 
1000, 2) + qps = int(1000.0 * self.batch_size / avg_latency) + + if model_name != 'gpt2': + predict_range.sort() + predict_tail_latency = round( + predict_range[int(len(predict_range) * 0.99)] * 1000, 2) + predict_avg_latency = round(sum(predict_range) / iterations * 1000, 2) + fps = int(1000.0 * batch_size / predict_avg_latency) + + log.info( + 'Batch size is {}, QPS: {}, Avg Latency:{}, Tail Latency:{}'. + format(self.batch_size, qps, avg_latency, tail_latency)) + + # log.info( + # 'Batch size is {}, fps: {}, predict_avg_latency:{}, predict_tail_latency:{}'. + # format(self.batch_size, fps, predict_avg_latency, tail_latency)) + + + report['QPS'] = qps + report['AVG Latency'] = avg_latency + report['P99 Latency'] = tail_latency + + if model_name != 'gpt2': + report['predict QPS'] = fps + report['predict AVG Latency'] = predict_avg_latency + report['predict P99 Latency'] = predict_tail_latency + + return report + + def get_loaded_batch_size(self): + # return self.workload['batch_sizes'][0] + return self.batch_size + + def load(self, batch_size) -> None: + global Dims + + # load engine + model = self.configs['model'] + model_name = self.configs['model'].split("-")[0] + model_path = self.configs['model_path'] + + precision = self.configs['compile_precision'].replace('FP32', 'FP16') + + if precision == 'FP16': + if model_name == 'resnet50' or model_name == 'bert' or model_name == 'albert' or model == 'deberta' or model_name == 'yolov5': + mod = importlib.import_module("tensorrt") + Dims = getattr(mod, "Dims") + else: + mod = importlib.import_module("tensorrt") + Dims = getattr(mod, "Dims") + + if precision == 'INT8': + mod = importlib.import_module("tensorrt") + Dims = getattr(mod, "Dims") + + load_ixrt_plugin(model=model_name, precision=precision) + + if model_name == 'gpt2': + self.batch_size = batch_size + return + + elif self.isSDmodel(model): + self.batch_size = batch_size + #self.load_sd(batch_size) + return + + if self.configs['compile_precision'] == 'FP16': + if model_name == 'videobert' or model_name == 'conformer' or model_name == 'yolov5': + engine_path = model_path.split(".")[0] + "_end.engine" + + elif model_name == 'widedeep' or model_name == 'roformer': + engine_path = model_path + "/" + model + "_end.engine" + + elif model_name == 'bert' or model_name == 'albert' or model_name == 'roberta' or model_name == 'deberta' or model_name == 'swin' \ + or model_name == 'resnet50': + engine_path = os.path.dirname(model_path) + "/" + model + "_end.engine" + + else: + engine_path = os.path.dirname(model_path) + "/" + model + ".engine" + + if model_name == 'widedeep': + engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/widedeep_dynamicshape" + ".engine" + + if model_name == 'roformer': + engine_path = "general_perf/model_zoo/popular/open_roformer/roformer-frozen_end" + ".engine" + + if model_name == 'deberta': + engine_path = "general_perf/model_zoo/popular/open_deberta/deberta-sim-drop-clip-drop-invaild-cast_end" + ".engine" + + if self.configs['compile_precision'] == 'INT8': + if model_name == 'widedeep': + engine_path = "general_perf/model_zoo/regular/open_wide_deep_saved_model/quantized_widedeep_staticshape" + ".engine" + + if model_name == 'resnet50': + engine_path = "general_perf/model_zoo/regular/open_resnet50/quantized_Resnet50" + ".engine" + + if model_name == 'yolov5': + engine_path = "general_perf/model_zoo/popular/open_yolov5/quantized_yolov5s" + ".engine" + + if model_name == 'bert': + engine_path = 
"general_perf/model_zoo/regular/open_bert/bert_zijie_int8_b196.engine" + + engine, context = init_by_tensorrt(engine_path) + + self.model_runtimes.append(engine) + + self.input_type = self.configs['input_type'] + + self.batch_size = batch_size + self.engine = engine + self.context = context + + + def load_sd(self, batch_size): + model_path = self.configs['model_path'] + + import onnx + from onnx2torch import convert + + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + + self.model_sd = convert(model_path).to(device) + + self.input_type = self.configs['input_type'] + self.batch_size = batch_size + pass + + def predict_sd(self, dataloader): + self.model_sd = self.model_sd.eval() + dataloader = dataloader[0].to('cuda') + torch.cuda.synchronize() + starttime = time.time() + out = self.model_sd(dataloader) + torch.cuda.synchronize() + endtime = time.time() + + self.predict_time = endtime - starttime + + return out + + def load_igie(self, batch_size): + model = self.configs['model'] + model_path = self.configs['model_path'] + + tvm = importlib.import_module("tvm") + from general_perf.backends.ILUVATAR.utils import get_target + + target, _ = get_target('iluvatar_with_all_libs') + device = tvm.device(target.kind.name, self.task.device_id) + engine_path = os.path.dirname(model_path) + "/" + model + "_bs" + str(batch_size) + ".so" + lib = tvm.runtime.load_module(engine_path) + self.task.module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + self.device = device + self.batch_size = batch_size + + def _get_fake_samples(self, batch_size, shape, input_type): + data = {} + if input_type: + i = 0 + for key, val in shape.items(): + if key != "text": + val = [val[0] * batch_size] + val[1:] + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + else: + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + i += 1 + return data + else: + raise ValueError("Please provide input type") \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4f6b310791e21426c778fed9cee85fc069ff1630 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/__init__.py @@ -0,0 +1,20 @@ +from .file import load_json, save_json +from .timer import Timer + + +from .argument import get_args +from .import_model import import_model_to_igie +from .target import get_target + +from .dataloader import get_dataloader_from_args, download_builtin_data + + +from .imagenet_metric import get_topk_accuracy +from .coco_metric import COCO2017Evaluator, COCO2017EvaluatorForYolox, COCO2017EvaluatorForYolov4 + +from .quantization import igie_quantize_model_from_args, onnx_quantize_model_from_args + +from .mod_rewriter import modify_seq_len_for_nlp +from .stauts_checker import check_status + +from .compile_engine import compile_engine_from_args \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2f253fff5ab8ec3d06ea60f0d0331c99df009d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/argument.py @@ -0,0 +1,331 @@ +import argparse +import os +import 
sys +import json +from numbers import Number + +def to_bool(value): + if isinstance(value, bool): + return value + elif isinstance(value, str): + return value.lower() in ("yes", "true", "t", "1") + elif isinstance(value, Number): + return value != 0 + else: + return False + + +def get_args_parser(): + + parser = argparse.ArgumentParser() + + # always required + parser.add_argument("--model_path", + type=str, + required=True, + help="model path or model name in torchviso") + + parser.add_argument("--input", + type=str, + required=True, + dest="input", + nargs='+', + help=""" + input name and shape/dtype, format shoul be input_name:input_shape or input_name:input_shape/dtype, + and use space to connect multiple inputs, + if dtype is not given, we assuem the dtype is float32 + single input case: --input input1:1,3,224,224 + multiple inputs case: --input input1:32,3,224,224 input2:32,100 + miltiple inputs with differnet dtype case: --input input1:32,3,224,224/float32 input2:32,100/int64 + """) + + parser.add_argument("--precision", + type=str, + choices=["fp32", "fp16", "int8"], + required=True, + help="model inference precision") + + ## common optional + parser.add_argument("--target", + type=str, + choices=["llvm", "iluvatar", "iluvatar_with_cudnn_cublas", "iluvatar_with_ixinfer", "iluvatar_with_all_libs"], + default="iluvatar_with_all_libs", + help="""IGIE compile target + llvm: cpu only + iluvatar: gpu without any other accerelate library + iluvatar_with_cudnn_cublas: gpu with all accerelate library cudnn/cublas + iluvatar_with_ixinfer: gpu with all accerelate library ixinfer + iluvatar_with_all_libs: gpu with all accerelate library cudnn/cublas/ixinfer + """) + + parser.add_argument("--engine_path", + type=str, + default=None, + help="save path of engine, save in pwd if not provided") + + parser.add_argument("--warmup", + type=int, + default=3, + help="numbe of warmup before test") + + # parser.add_argument("--test_count", + # type=int, + # default=None, + # help="number of batch to test, test all batch if not specified") + + parser.add_argument("--verbose", + type=to_bool, + default=False, + help="dump igie mod to file if is True") + + parser.add_argument("--num_workers", + type=int, + default=16, + help="number of workers used in pytorch dataloader") + + parser.add_argument("--batch_size", + type=int, + default=None, + help="""model batch size for datalodaer, + use the first dimension of the first input when not specified + this argument will be useful for multi-input case: + e.g. 
input_ids:1000,22 pixel_values:32,3,224,224 attention_mask:1000,22 + """) + + ## dataset + parser.add_argument("--use_imagenet", + type=to_bool, + default=False, + help="use imagenet val dataet for calibration and test") + + parser.add_argument("--use_coco2017", + type=to_bool, + default=False, + help="use coco2017 val datatset for calibration and test") + + # parser.add_argument("--custom_data_path", + # type=str, + # default=None, + # help="user-provided custom data path to define user's datalodaer" + # ) + + parser.add_argument("--input_layout", + type=str, + choices=["NHWC", "NCHW"], + default="NCHW", + help="model input layout, only works for cv model") + + parser.add_argument("--calibration_file_path", + type=str, + default=None, + help="user-provided calibration npy data path, only used for calibration") + + ## custom quantization config + parser.add_argument("--automatic_yolo_quantization", + type=to_bool, + default=False, + help="automaticlly find the best strategy for yolo by skipping the yolo detect node quantization") + + parser.add_argument("--quantization_config_path", + type=str, + default=None, + help="quantization config path for onnxruntime, should be a json file, refer to igie-doc for more infomation") + + + + parser.add_argument("--acc_target", + type=float, + default=None, + help="Model inference Accuracy target.") + + parser.add_argument("--fps_target", + type=float, + default=None, + help="Model inference FPS target.") + + parser.add_argument("--perf_only", + type=to_bool, + default=False, + help="run performance test only") + + parser.add_argument('--just_export', + type=to_bool, + default=False, + help="just export engine and return") + + ## other custom option + + parser.add_argument("--custom_option", + type=str, + default=None, + dest="custom_option", + nargs='+', + help=""" + user-provided custom key:value option, use space to connect multiple option, + bool value will be cast to Python bool type automaticaly, + single option case: --custom_option my_data_path:/local/data + multiple option case: --custom_option my_data_path:/local/data use_optionA:True + """) + + + return parser + + + +def _parse_framework(args_dict): + model_path_or_name = args_dict["model_path"] + framework = None + + # NOTE(chen.chen): + # We rely on the suffix to distinguish the source framework of the model, + # e.g. model.onnx, model.pb, etc. + + # But if the model_path is_not exists, we will try to find it from torchvision and raise except when not found + # e.g. resnet18, resnet50 + + if os.path.exists(model_path_or_name): + ext = os.path.splitext(model_path_or_name)[1] + + if ext == ".onnx": + framework = "onnx" + elif ext == ".pb": + framework = "tensorflow" + elif ext == ".pt": + framework = "pytorch" + else: + raise ValueError(f"{ext} is not supported yet") + else: + # NOTE(chen.chen) + # paddle model saved as a directory + # so we need check if it is a paddle model here + paddle_model = f"{model_path_or_name}.pdmodel" + if os.path.exists(paddle_model): + framework = "paddle" + else: + # NOTE(chen.chen): + # we support use torchvision pretrained model + # when model_path has no extension, we will try to find it from torchvision + # e.g. 
--model_path resnet50 + framework = "pytorch" + + args_dict["model_framework"] = framework + + + +def _parse_input(args_dict): + input_list = args_dict.pop("input") + + input_dict = {} + input_name_list = [] + input_shape_list = [] + input_dtype_list = [] + batch_size = None + for i in input_list: + name, shape_dtype = i.rsplit(":", 1) + if "/" in shape_dtype: + shape, dtype = shape_dtype.split("/") + dtype = dtype.replace("fp", "float") + input_dtype_list.append(dtype) + else: + shape = shape_dtype + input_dtype_list.append("float32") + shape = tuple([int(j) for j in shape.split(",")]) + input_dict[name] = shape + input_name_list.append(name) + input_shape_list.append(shape) + + if batch_size is None: + batch_size = shape[0] + + args_dict["input_dict"] = input_dict + args_dict["input_name_list"] = input_name_list + args_dict["input_shape_list"] = input_shape_list + args_dict["input_dtype_list"] = input_dtype_list + if args_dict["batch_size"] is None: + args_dict["batch_size"] = batch_size + + +def _parse_engine_path(args_dict): + if args_dict["engine_path"] is None: + model_base_name = os.path.splitext(os.path.split(args_dict["model_path"])[1])[0] + args_dict["engine_path"] = f"{model_base_name}_batchsize_{args_dict['batch_size']}_{args_dict['precision']}.so" + assert args_dict["engine_path"].endswith("so") + + +def _parse_custom_option(args_dict): + custom_option_dict = {} + if args_dict["custom_option"] is not None : + custom_option = args_dict.pop("custom_option") + + for option in custom_option: + key, value = option.split(":", 1) + if value.lower() == "true": + value = True + elif value.lower() == "false": + value = False + elif "," in value: + value = value.split(",") + custom_option_dict[key] = value + + required_pass = custom_option_dict.get("required_pass", []) + if not isinstance(required_pass, list): + required_pass = [required_pass] + + args_dict["required_pass"] = required_pass + args_dict["custom_option"] = custom_option_dict + + +def _parse_dataset(args_dict): + args_dict["use_builtin_data"] = args_dict["use_imagenet"] or args_dict["use_coco2017"] + if not args_dict["use_builtin_data"]: + args_dict["perf_only"] = True + +def _parse_quantization_config(args_dict): + + quantization_config_path = args_dict["quantization_config_path"] + if quantization_config_path is not None: + assert os.path.exists(quantization_config_path) + + with open(quantization_config_path, "r") as f: + data = json.load(f) + args_dict["quantization_config"] = data + else: + args_dict["quantization_config"] = {} + + + +def get_args(return_dict=False): + if sys.version_info.major != 3 and sys.version_info.minor < 7: + raise ValueError(f"need at least python3.7, got {sys.version}") + + args_dict = vars(get_args_parser().parse_args()) + + _parse_framework(args_dict) + _parse_input(args_dict) + _parse_engine_path(args_dict) + _parse_quantization_config(args_dict) + _parse_dataset(args_dict) + _parse_custom_option(args_dict) + + from pprint import pprint + pprint(args_dict, indent=2) + + if return_dict: + return args_dict + + return argparse.Namespace(**args_dict) + + + +if __name__ == "__main__": + # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224 --precision=int8 + # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8 + # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444/float32 input2:32,100/int64 --precision=int8 + # python3 argument.py --model_path=a/b/c.onnx --input 
input1:32,3,224,224,44444/float32 input2:32,100/fp16 --precision=int8 + # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8 --custom_option my_data_path:/local/data use_optionA:True + # python3 argument.py --model_path=a/b/c.onnx --input input1:32,3,224,224,44444 input2:32,100 --precision=int8 --custom_option my_data_path:/local/data use_optionA:True required_pass:pass1,pass2,pass3 + args = get_args(return_dict=True) + + from pprint import pprint + pprint(args) + \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e468bb79d66a0c75b593460369c516fba3f309 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/coco_metric.py @@ -0,0 +1,622 @@ +import os +import json +import cv2 +import numpy as np + +import torch +import torchvision +from pycocotools.coco import COCO + +def get_coco_accuracy(pred_json, ann_json): + coco = COCO(annotation_file=ann_json) + coco_pred = coco.loadRes(pred_json) + try: + from .fastCoCoeval.fast_coco_eval_api import COCOeval_opt as COCOeval + coco_evaluator = COCOeval(cocoGt=coco, cocoDt=coco_pred, iouType="bbox") + except: + from pycocotools.cocoeval import COCOeval + print("Can't import fastCoCoeval, Using PyCoCcotools API ...") + coco_evaluator = COCOeval(cocoGt=coco, cocoDt=coco_pred, iouType="bbox") + + coco_evaluator.evaluate() + coco_evaluator.accumulate() + coco_evaluator.summarize() + return coco_evaluator.stats + +coco80_to_coco91 = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, + 89, 90 +] + +coco80_to_coco91_dict = {idx: i for idx, i in enumerate(coco80_to_coco91)} +coco91_to_coco80_dict = {i: idx for idx, i in enumerate(coco80_to_coco91)} + + +def letterbox(im, new_shape=(640, 640), color=(114, 114, 114)): + # Resize and pad image while meeting stride-multiple constraints + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[ + 1] # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder(im, + top, + bottom, + left, + right, + cv2.BORDER_CONSTANT, + value=color) # add border + return im, ratio, (dw, dh) + + +def box_area(box): + # box = xyxy(4,n) + return (box[2] - box[0]) * (box[3] - box[1]) + + +def box_iou(box1, box2, eps=1e-7): + # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
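+    Following the code below, IoU = inter / (area1 + area2 - inter + eps).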
+ Arguments: + box1 (Tensor[N, 4]) + box2 (Tensor[M, 4]) + Returns: + iou (Tensor[N, M]): the NxM matrix containing the pairwise + IoU values for every element in boxes1 and boxes2 + """ + + # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2) + (a1, a2), (b1, b2) = box1[:, None].chunk(2, 2), box2.chunk(2, 1) + inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2) + + # IoU = inter / (area1 + area2 - inter) + return inter / (box_area(box1.T)[:, None] + box_area(box2.T) - inter + eps) + + +def xyxy2xywh(x): + # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] where xy1=top-left, xy2=bottom-right + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = (x[:, 0] + x[:, 2]) / 2 # x center + y[:, 1] = (x[:, 1] + x[:, 3]) / 2 # y center + y[:, 2] = x[:, 2] - x[:, 0] # width + y[:, 3] = x[:, 3] - x[:, 1] # height + return y + + +def xywh2xyxy(x): + # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = x[:, 0] - x[:, 2] / 2 # top left x + y[:, 1] = x[:, 1] - x[:, 3] / 2 # top left y + y[:, 2] = x[:, 0] + x[:, 2] / 2 # bottom right x + y[:, 3] = x[:, 1] + x[:, 3] / 2 # bottom right y + return y + + +def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0): + # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = w * (x[:, 0] - x[:, 2] / 2) + padw # top left x + y[:, 1] = h * (x[:, 1] - x[:, 3] / 2) + padh # top left y + y[:, 2] = w * (x[:, 0] + x[:, 2] / 2) + padw # bottom right x + y[:, 3] = h * (x[:, 1] + x[:, 3] / 2) + padh # bottom right y + return y + + +def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0): + # Convert nx4 boxes from [x1, y1, x2, y2] to [x, y, w, h] normalized where xy1=top-left, xy2=bottom-right + if clip: + clip_boxes(x, (h - eps, w - eps)) # warning: inplace clip + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = ((x[:, 0] + x[:, 2]) / 2) / w # x center + y[:, 1] = ((x[:, 1] + x[:, 3]) / 2) / h # y center + y[:, 2] = (x[:, 2] - x[:, 0]) / w # width + y[:, 3] = (x[:, 3] - x[:, 1]) / h # height + return y + + +def xyn2xy(x, w=640, h=640, padw=0, padh=0): + # Convert normalized segments into pixel segments, shape (n,2) + y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x) + y[:, 0] = w * x[:, 0] + padw # top left x + y[:, 1] = h * x[:, 1] + padh # top left y + return y + + +def segment2box(segment, width=640, height=640): + # Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy) + x, y = segment.T # segment xy + inside = (x >= 0) & (y >= 0) & (x <= width) & (y <= height) + x, y, = x[inside], y[inside] + return np.array([x.min(), y.min(), x.max(), + y.max()]) if any(x) else np.zeros((1, 4)) # xyxy + + +def segments2boxes(segments): + # Convert segment labels to box labels, i.e. (cls, xy1, xy2, ...) 
to (cls, xywh) + boxes = [] + for s in segments: + x, y = s.T # segment xy + boxes.append([x.min(), y.min(), x.max(), y.max()]) # cls, xyxy + return xyxy2xywh(np.array(boxes)) # cls, xywh + + +def resample_segments(segments, n=1000): + # Up-sample an (n,2) segment + for i, s in enumerate(segments): + s = np.concatenate((s, s[0:1, :]), axis=0) + x = np.linspace(0, len(s) - 1, n) + xp = np.arange(len(s)) + segments[i] = np.concatenate([ + np.interp(x, xp, s[:, i]) for i in range(2) + ]).reshape(2, -1).T # segment xy + return segments + + +def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None): + # Rescale boxes (xyxy) from img1_shape to img0_shape + if ratio_pad is None: # calculate from img0_shape + gain = min(img1_shape[0] / img0_shape[0], + img1_shape[1] / img0_shape[1]) # gain = old / new + pad = (img1_shape[1] - img0_shape[1] * gain) / 2, ( + img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding + else: + gain = ratio_pad[0][0] + pad = ratio_pad[1] + + boxes[:, [0, 2]] -= pad[0] # x padding + boxes[:, [1, 3]] -= pad[1] # y padding + boxes[:, :4] /= gain + clip_boxes(boxes, img0_shape) + return boxes + + +def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None): + # Rescale coords (xyxy) from img1_shape to img0_shape + if ratio_pad is None: # calculate from img0_shape + gain = min(img1_shape[0] / img0_shape[0], + img1_shape[1] / img0_shape[1]) # gain = old / new + pad = (img1_shape[1] - img0_shape[1] * gain) / 2, ( + img1_shape[0] - img0_shape[0] * gain) / 2 # wh padding + else: + gain = ratio_pad[0][0] + pad = ratio_pad[1] + + segments[:, 0] -= pad[0] # x padding + segments[:, 1] -= pad[1] # y padding + segments /= gain + clip_segments(segments, img0_shape) + return segments + + +def clip_boxes(boxes, shape): + # Clip boxes (xyxy) to image shape (height, width) + if isinstance(boxes, torch.Tensor): # faster individually + boxes[:, 0].clamp_(0, shape[1]) # x1 + boxes[:, 1].clamp_(0, shape[0]) # y1 + boxes[:, 2].clamp_(0, shape[1]) # x2 + boxes[:, 3].clamp_(0, shape[0]) # y2 + else: # np.array (faster grouped) + boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 + boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 + + +def clip_segments(boxes, shape): + # Clip segments (xy1,xy2,...) 
to image shape (height, width) + if isinstance(boxes, torch.Tensor): # faster individually + boxes[:, 0].clamp_(0, shape[1]) # x + boxes[:, 1].clamp_(0, shape[0]) # y + else: # np.array (faster grouped) + boxes[:, 0] = boxes[:, 0].clip(0, shape[1]) # x + boxes[:, 1] = boxes[:, 1].clip(0, shape[0]) # y + + +def non_max_suppression( + prediction, + conf_thres=0.25, + iou_thres=0.45, + classes=None, + agnostic=False, + multi_label=True, + labels=(), + max_det=300, + nm=0, # number of masks +): + """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections + + Returns: + list of detections, on (n,6) tensor per image [xyxy, conf, cls] + """ + + if isinstance( + prediction, (list, tuple) + ): # YOLOv5 model in validation model, output = (inference_out, loss_out) + prediction = prediction[0] # select only inference output + + bs = prediction.shape[0] # batch size + nc = prediction.shape[2] - nm - 5 # number of classes + xc = prediction[..., 4] > conf_thres # candidates + + # Checks + assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0' + assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0' + + # Settings + # min_wh = 2 # (pixels) minimum box width and height + max_wh = 7680 # (pixels) maximum box width and height + max_nms = 30000 # maximum number of boxes into torchvision.ops.nms() + time_limit = 0.5 + 0.05 * bs # seconds to quit after + redundant = True # require redundant detections + multi_label &= nc > 1 # multiple labels per box (adds 0.5ms/img) + merge = False # use merge-NMS + + # t = time.time() + mi = 5 + nc # mask start index + output = [torch.zeros((0, 6 + nm))] * bs + for xi, x in enumerate(prediction): # image index, image inference + # Apply constraints + # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height + x = x[xc[xi]] # confidence + + # Cat apriori labels if autolabelling + if labels and len(labels[xi]): + lb = labels[xi] + v = torch.zeros((len(lb), nc + nm + 5), device=x.device) + v[:, :4] = lb[:, 1:5] # box + v[:, 4] = 1.0 # conf + v[range(len(lb)), lb[:, 0].long() + 5] = 1.0 # cls + x = torch.cat((x, v), 0) + + # If none remain process next image + if not x.shape[0]: + continue + + # Compute conf + x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf + + # Box/Mask + box = xywh2xyxy( + x[:, :4]) # center_x, center_y, width, height) to (x1, y1, x2, y2) + mask = x[:, mi:] # zero columns if no masks + + # Detections matrix nx6 (xyxy, conf, cls) + if multi_label: + i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T + x = torch.cat( + (box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1) + else: # best class only + conf, j = x[:, 5:mi].max(1, keepdim=True) + x = torch.cat((box, conf, j.float(), mask), + 1)[conf.view(-1) > conf_thres] + + # Filter by class + if classes is not None: + x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)] + + # Apply finite constraint + # if not torch.isfinite(x).all(): + # x = x[torch.isfinite(x).all(1)] + + # Check shape + n = x.shape[0] # number of boxes + if not n: # no boxes + continue + elif n > max_nms: # excess boxes + x = x[x[:, 4].argsort( + descending=True)[:max_nms]] # sort by confidence + else: + x = x[x[:, 4].argsort(descending=True)] # sort by confidence + + # Batched NMS + c = x[:, 5:6] * (0 if agnostic else max_wh) # classes + boxes, scores = x[:, :4] + c, x[:, + 4] # boxes (offset by class), scores + i = torchvision.ops.nms(boxes, scores, iou_thres) # 
NMS + if i.shape[0] > max_det: # limit detections + i = i[:max_det] + if merge and (1 < n < + 3E3): # Merge NMS (boxes merged using weighted mean) + # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4) + iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix + weights = iou * scores[None] # box weights + x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum( + 1, keepdim=True) # merged boxes + if redundant: + i = i[iou.sum(1) > 1] # require redundancy + + output[xi] = x[i] + return output + + + + +#NOTE(chen.chen): +# just work for coco2017 val using pycocotools +# maybe we need some abstraction here for generic coco-like dataset +class COCO2017Evaluator: + def __init__(self, + label_path, + image_size=640, + with_nms=False, + conf_thres=0.001, + iou_thres=0.65): + self.with_nms = with_nms + self.conf_thres = conf_thres + self.iou_thres = iou_thres + self.label_path = label_path + self.image_size = image_size + + self.jdict = [] + + # iou vector for mAP@0.5:0.95 + self.iouv = torch.linspace(0.5, 0.95, 10) + self.niou = self.iouv.numel() + + def evaluate(self, pred, all_inputs, nms_count=None): + im = all_inputs[0] + targets = all_inputs[1] + paths = all_inputs[2] + shapes = all_inputs[3] + + _, _, height, width = im.shape + targets[:, 2:] *= np.array((width, height, width, height)) + + if self.with_nms: + assert nms_count is not None + tmp_out = [] + for boxes, count in zip(pred, nms_count): + count = count[0] + boxes = boxes[:count, :] + boxes_cp = boxes.copy() + # (x1,y1,x2,y2,class_id,score) + # To (x1,y1,x2,y2,score,class_id) + boxes[:, 4] = boxes_cp[:, 5] + boxes[:, 5] = boxes_cp[:, 4] + tmp_out.append(torch.from_numpy(boxes)) + pred = tmp_out + else: + pred = torch.from_numpy(pred) + pred = non_max_suppression(pred, self.conf_thres, self.iou_thres) + for idx, det in enumerate(pred): + img_path = paths[idx] + + predn = det + shape = shapes[idx][0] + scale_boxes(im[idx].shape[1:], predn[:, :4], shape, shapes[idx][1]) # native-space pred + + self._save_one_json(predn, self.jdict, img_path, coco80_to_coco91) # append to COCO-JSON dictionary + + + def _save_one_json(self, predn, jdict, path, class_map): + # Save one JSON result in the format + # {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236} + image_id = int(os.path.splitext(os.path.basename(path))[0]) + box = xyxy2xywh(predn[:, :4]) + box[:, :2] -= box[:, 2:] / 2 + for p, b in zip(predn.tolist(), box.tolist()): + jdict.append({ + 'image_id': image_id, + 'category_id': class_map[int(p[5])], + 'bbox': [round(x, 3) for x in b], + 'score': round(p[4], 5) + }) + + + def summary(self): + if len(self.jdict): + pred_json = os.path.join("coco2017_predictions.json") + with open(pred_json, 'w') as f: + json.dump(self.jdict, f) + result = get_coco_accuracy(pred_json, self.label_path) + else: + raise ValueError("can not find generated json dict for pycocotools") + return result + +# coco2017 val evaluator For Yolox +class COCO2017EvaluatorForYolox(COCO2017Evaluator): + def evaluate(self, pred, all_inputs): + im = all_inputs[0] + img_path = all_inputs[1] + img_info = all_inputs[2] + + _, _, height, width = im.shape + img_size = [height, width] + + pred = torch.from_numpy(self.Detect(pred, img_size=[height, width])) + + nms_outputs = self.postprocess( + pred, conf_thre=self.conf_thres, nms_thre=self.iou_thres + ) + + for (output, org_img, path) in zip(nms_outputs, img_info, img_path): + if output is None: + continue + + bboxes = output[:, 0:4] + + img_h, img_w = org_img + + scale = 
min(img_size[0] / float(img_h), img_size[1] / float(img_w)) + + bboxes /= scale + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + bboxes = self._xyxy2xywh(bboxes) + + self._save_one_json(bboxes, cls, scores, self.jdict, path, coco80_to_coco91) + + def Detect(self, outputs, img_size): + grids = [] + expanded_strides = [] + + strides = [8, 16, 32] + + hsizes = [img_size[0] // stride for stride in strides] + wsizes = [img_size[1] // stride for stride in strides] + + for hsize, wsize, stride in zip(hsizes, wsizes, strides): + xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize)) + grid = np.stack((xv, yv), 2).reshape(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + expanded_strides.append(np.full((*shape, 1), stride)) + + grids = np.concatenate(grids, 1) + expanded_strides = np.concatenate(expanded_strides, 1) + outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides + outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides + + return outputs + + def postprocess(self, prediction, num_classes=80, conf_thre=0.7, nms_thre=0.45, class_agnostic=False): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + output = [None for _ in range(len(prediction))] + + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() + # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + + if not detections.size(0): + continue + if class_agnostic: + nms_out_index = torchvision.ops.nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + nms_thre, + ) + else: + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + detections[:, 6], + nms_thre, + ) + detections = detections[nms_out_index] + + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + + return output + + def _xyxy2xywh(self, bboxes): + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0] + bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1] + return bboxes + + def _save_one_json(self, bboxes, class_, scores, jdict, path, class_map): + image_id = int(os.path.splitext(os.path.basename(path))[0]) + for box, score, cls in zip(bboxes.numpy().tolist(), scores.numpy().tolist(), class_.numpy().tolist()): + jdict.append({ + 'image_id': image_id, + 'category_id': class_map[int(cls)], + 'bbox': box, + 'score': score + }) + + +# coco2017 val evaluator For Yolov4 +class COCO2017EvaluatorForYolov4(COCO2017EvaluatorForYolox): + def evaluate(self, pred, all_inputs): + im = all_inputs[0] + img_path = all_inputs[1] + img_info = all_inputs[2] + + boxes = torch.squeeze(torch.from_numpy(pred[0]), dim=2) + confs = torch.from_numpy(pred[1]) + detections = torch.cat((boxes, confs.float()), 2) + + nms_outputs = self.postprocess( + detections, conf_thre=self.conf_thres, nms_thre=self.iou_thres + ) + + for 
(output, org_img, path) in zip(nms_outputs, img_info, img_path): + if output is None: + continue + + bboxes = output[:, 0:4] + img_h, img_w = org_img + bboxes[:, 0] *= img_w + bboxes[:, 2] *= img_w + bboxes[:, 1] *= img_h + bboxes[:, 3] *= img_h + + cls = output[:, 5] + scores = output[:, 4] + + bboxes = self._xyxy2xywh(bboxes) + self._save_one_json(bboxes, cls, scores, self.jdict, path, coco80_to_coco91) + + def postprocess(self, prediction, num_classes=80, conf_thre=0.7, nms_thre=0.45, class_agnostic=False): + output = [None for _ in range(len(prediction))] + + for i, image_pred in enumerate(prediction): + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max(image_pred[:, 4: 4 + num_classes], 1, keepdim=True) + + conf_mask = (class_conf.squeeze() >= conf_thre).squeeze() + detections = torch.cat((image_pred[:, :4], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + + if not detections.size(0): + continue + if class_agnostic: + nms_out_index = torchvision.ops.nms( + detections[:, :4], + detections[:, 4], + nms_thre, + ) + else: + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4], + detections[:, 5], + nms_thre, + ) + detections = detections[nms_out_index] + + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + + return output \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e310f6b8bd6d5d53414c820fc7ca1c2b1c471d9e --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/compile_engine.py @@ -0,0 +1,19 @@ +import os +import tvm + +from .import_model import import_model_to_igie +from .target import get_target + + +# a simple wrapper for compile engine and get module +def compile_engine_from_args(args): + target, device = get_target(args.target) + + if not os.path.exists(args.engine_path): + mod, params = import_model_to_igie(args.model_path, args.input_dict, args.model_framework) + lib = tvm.relay.build(mod, target=target, params=params, precision=args.precision, verbose=args.verbose, required_pass=args.required_pass) + lib.export_library(args.engine_path) + else: + lib = tvm.runtime.load_module(args.engine_path) + module = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + return module \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py new file mode 100644 index 0000000000000000000000000000000000000000..8a01ef7e7485abc2ec3f2effbb6b2faf97dc3229 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/dataloader.py @@ -0,0 +1,595 @@ +import os +import numpy as np +from PIL import Image +from collections import defaultdict + +import tensorflow as tf +try: + tf = tf.compat.v1 +except ImportError: + tf = tf +tf.enable_eager_execution() + +import torch +import torchvision +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +from pycocotools.coco import COCO + +from .coco_metric import * + +_igie_cache_dir = 
os.path.expanduser("~/.igie_cache") +_bulitin_data_url = "http://10.113.3.3/data/CI_DATA/ci_data.tar.gz" +_builtin_data_path = os.path.join(_igie_cache_dir, "modelzoo_data") +_symbolic_link_data_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data") + + +### Tensorflow image pre-process function +def _mean_image_subtraction(image, means): + """Subtracts the given means from each image channel.""" + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + num_channels = image.get_shape().as_list()[-1] + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + channels = tf.split(axis=2, num_or_size_splits=num_channels, value=image) + for i in range(num_channels): + channels[i] -= means[i] + return tf.concat(axis=2, values=channels) + +def _central_crop(image, crop_height, crop_width): + shape = tf.shape(image) + height, width = shape[0], shape[1] + + amount_to_be_cropped_h = (height - crop_height) + crop_top = amount_to_be_cropped_h // 2 + amount_to_be_cropped_w = (width - crop_width) + crop_left = amount_to_be_cropped_w // 2 + return tf.slice(image, [crop_top, crop_left, 0], [crop_height, crop_width, -1]) + +def _aspect_preserving_resize(image, resize_min): + """Resize images preserving the original aspect ratio. + """ + shape = tf.shape(image) + height, width = shape[0], shape[1] + new_height, new_width = _smallest_size_at_least(height, width, resize_min) + return _resize_image(image, new_height, new_width) + +def _smallest_size_at_least(height, width, resize_min): + resize_min = tf.cast(resize_min, tf.float32) + # Convert to floats to make subsequent calculations go smoothly. + height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) + smaller_dim = tf.minimum(height, width) + scale_ratio = resize_min / smaller_dim + # Convert back to ints to make heights and widths that TF ops will accept. 
+ new_height = tf.cast(height * scale_ratio, tf.int32) + new_width = tf.cast(width * scale_ratio, tf.int32) + return new_height, new_width + +def _resize_image(image, height, width): + return tf.image.resize(image, [height, width], method=tf.image.ResizeMethod.BILINEAR, preserve_aspect_ratio=False) + + + +### Pytorch image pre-process function +def _torch_imagenet_preprocess(image_path): + img = Image.open(image_path).convert('RGB') + # preprocess image to nomalized tensor for pytorch + _PYTORCH_IMAGENET_PREPROCESS = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[ + 0.229, 0.224, 0.225]), + ] + ) + img = _PYTORCH_IMAGENET_PREPROCESS(img) + return img + + +### Tensorflow image pre-process function +def _tf_imagenet_preprocess(image_path): + img = Image.open(image_path).convert('RGB') + _TF_IMAGENET_PREPROCESS = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ] + ) + + img = _TF_IMAGENET_PREPROCESS(img) + img *= 255.0 + assert len(img.shape) == 3 + img = transforms.Normalize(mean=[123.68, 116.78, 103.94], std=[1, 1, 1])(img) + img = img.permute((1, 2, 0)) # CHW -> HWC + + return img + + +class ImageNetDataset(torch.utils.data.Dataset): + def __init__(self, image_dir_path, label_dir_path="", layout="NHWC", image_size=(224, 224)): + super().__init__() + self.image_dir_path = image_dir_path + self.label_dir_path = label_dir_path + self.layout = layout + + if len(image_size) == 1: + self.image_height = self.image_width = image_size + if len(image_size) == 2: + self.image_height = image_size[0] + self.image_width = image_size[1] + assert self.layout in ["NHWC", "NCHW"], f"layout should be NHWC or NCHW, got {self.layout} " + self.img_list = os.listdir(self.image_dir_path) + self.label_dict = self.get_label_dict() + + self.images = [] + self.length = 0 + + for image_dir in self.img_list: + image_path = os.path.join(self.image_dir_path, image_dir) + if os.path.isdir(image_path): + for image in os.listdir(image_path): + self.images.append(os.path.join(image_path, image)) + self.length += 1 + + def __getitem__(self, index): + ## NHWC pre-process for tensorflow + if self.layout == "NHWC": + processed_image = _tf_imagenet_preprocess(self.images[index]) + # image = cv2.imread(self.images[index]) + # image = cv2.cvtColor(image, 4) + # resize_image = _aspect_preserving_resize(image, 256) + # crop_image = _central_crop(resize_image, self.image_height, self.image_width) + # crop_image.set_shape([self.image_height, self.image_width, 3]) + # crop_image = tf.to_float(crop_image) + # processed_image = _mean_image_subtraction(crop_image, [123.68, 116.78, 103.94]).numpy() + + ## NCHW pre-process for Pytorch + elif self.layout == "NCHW": + processed_image = _torch_imagenet_preprocess(self.images[index]) + else: + raise ValueError("Unsupported data layout") + + image_name = self.images[index].split('/')[-1].strip() + label = self.label_dict[image_name] + + return processed_image, label + + def __len__(self): + return self.length + + def get_label_dict(self): + image_label = {} + label_path = os.path.join(self.image_dir_path, 'val.txt') + if self.label_dir_path != "": + label_path = self.label_dir_path + if os.path.exists(label_path): + with open(label_path, 'r') as file: + lines = file.readlines() + + for line in lines: + image = line.split(' ')[0].strip() + label = line.split(' ')[1].strip() + image_label[image] = int(label) + + return 
image_label + +def get_imagenet_dataloader(data_path, batch_size, num_workers, model_framework, input_layout): + if model_framework == "tensorflow": + val_dir = os.path.join(data_path, "val") + dataset = ImageNetDataset(val_dir, layout="NHWC") + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers, drop_last=True) + + else: + assert input_layout == "NCHW" + val_dir = os.path.join(data_path, 'validation') + assert os.path.isdir(val_dir), f"{val_dir} does not exist, please specify correct data path" + + dataset = torchvision.datasets.ImageFolder( + val_dir, + transforms.Compose( + [ + transforms.Resize(256, interpolation=InterpolationMode.BILINEAR), + transforms.CenterCrop(224), + transforms.PILToTensor(), + transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) + ] + ) + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size, num_workers=num_workers, drop_last=True) + + return dataloader + +class COCO2017Dataset(torch.utils.data.Dataset): + def __init__(self, + image_dir_path, + label_json_path, + image_size=640, + pad_color=114, + val_mode=True, + input_layout="NCHW"): + + self.image_dir_path = image_dir_path + self.label_json_path = label_json_path + self.image_size = image_size + self.pad_color = pad_color + self.val_mode = val_mode + self.input_layout = input_layout + + self.coco = COCO(annotation_file=self.label_json_path) + + if self.val_mode: + self.img_ids = list(sorted(self.coco.imgs.keys())) # 5000 + else: # train mode need images with labels + self.img_ids = sorted(list(self.coco.imgToAnns.keys())) # 4952 + + def __len__(self): + return len(self.img_ids) + + def __getitem__(self, index): + img_path = self._get_image_path(index) + img, (h0, w0), (h, w) = self._load_image(index) + + img, ratio, pad = letterbox(img, + self.image_size, + color=(self.pad_color, self.pad_color, self.pad_color)) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # load label + raw_label = self._load_json_label(index) + # normalized xywh to pixel xyxy format + raw_label[:, 1:] = xywhn2xyxy(raw_label[:, 1:], + ratio[0] * w, + ratio[1] * h, + padw=pad[0], + padh=pad[1]) + + raw_label[:, 1:] = xyxy2xywhn(raw_label[:, 1:], + w=img.shape[1], + h=img.shape[0], + clip=True, + eps=1E-3) + + nl = len(raw_label) # number of labels + labels_out = np.zeros((nl, 6)) + labels_out[:, 1:] = raw_label + + # Convert + img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB + img = np.ascontiguousarray(img) / 255.0 # 0~1 np array + if self.input_layout == "NHWC": + img = img.transpose((1, 2, 0)) + + return img, labels_out, img_path, shapes + + def _get_image_path(self, index): + idx = self.img_ids[index] + path = self.coco.loadImgs(idx)[0]["file_name"] + img_path = os.path.join(self.image_dir_path, path) + return img_path + + def _load_image(self, index): + img_path = self._get_image_path(index) + + im = cv2.imread(img_path) # BGR + h0, w0 = im.shape[:2] # orig hw + r = self.image_size / max(h0, w0) # ratio + if r != 1: # if sizes are not equal + im = cv2.resize(im, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_LINEAR) + return im.astype("float32"), (h0, w0), im.shape[:2] # im, hw_original, hw_resized + + def _load_json_label(self, index): + _, (h0, w0), _ = self._load_image(index) + + idx = self.img_ids[index] + ann_ids = self.coco.getAnnIds(imgIds=idx) + targets = self.coco.loadAnns(ids=ann_ids) + + labels = [] + for target in targets: + cat = target["category_id"] + coco80_cat 
= coco91_to_coco80_dict[cat] + cat = np.array([[coco80_cat]]) + + x, y, w, h = target["bbox"] + x1, y1, x2, y2 = x, y, int(x + w), int(y + h) + xyxy = np.array([[x1, y1, x2, y2]]) + xywhn = xyxy2xywhn(xyxy, w0, h0) + labels.append(np.hstack((cat, xywhn))) + + if labels: + labels = np.vstack(labels) + else: + if self.val_mode: + # for some image without label + labels = np.zeros((1, 5)) + else: + raise ValueError(f"set val_mode = False to use images with labels") + + return labels + + @staticmethod + def collate_fn(batch): + im, label, path, shapes = zip(*batch) + for i, lb in enumerate(label): + lb[:, 0] = i + return np.concatenate([i[None] for i in im], axis=0), np.concatenate(label, 0), path, shapes + +# Datasets just for Yolox +class COCO2017DatasetForYolox(COCO2017Dataset): + def __getitem__(self, index): + img_path = self._get_image_path(index) + img = self._load_image(img_path) + + img, r = self.preproc(img, input_size=self.image_size) + + return img, img_path, r + + def _load_image(self, img_path): + img = cv2.imread(img_path) + assert img is not None, f"file {img_path} not found" + + return img + + def preproc(self, img, input_size, swap=(2, 0, 1)): + if len(img.shape) == 3: + padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 + else: + padded_img = np.ones(input_size, dtype=np.uint8) * 114 + + org_img = (img.shape[0], img.shape[1]) + r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + + padded_img = padded_img.transpose(swap) + padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) + return padded_img, org_img + + @staticmethod + def collate_fn(batch): + im, img_path, r = zip(*batch) + return np.concatenate([i[None] for i in im], axis=0), img_path, r + +# Datasets just for Yolox +class COCO2017DatasetForYolov4(COCO2017DatasetForYolox): + def preproc(self, img, input_size, swap=(2, 0, 1)): + org_img = (img.shape[0], img.shape[1]) + img_ = cv2.resize(img, (input_size[0], input_size[1])) + img_ = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB) + img_ = img_.transpose(swap) / 255.0 + img_ = np.ascontiguousarray(img_, dtype=np.float32) + return img_, org_img + +def get_coco2017_dataloader(data_path, label_path, batch_size, image_size, num_workers, model_framework, input_layout, custom_option=None): + # TODO(chen.chen) + # we only support pytorch-like coco2017 data preprocess + # some problems may occur when the data preprocess is different, e.g. 
tensorflow + assert model_framework != "tensorflow" + if custom_option == 'yolox': + dataset = COCO2017DatasetForYolox(data_path, label_path, image_size=(image_size, image_size), input_layout=input_layout) + elif custom_option == 'yolov4': + dataset = COCO2017DatasetForYolov4(data_path, label_path, image_size=(image_size, image_size), input_layout=input_layout) + else: + dataset = COCO2017Dataset(data_path, label_path, image_size, input_layout=input_layout) + + # NOTE(chen.chen) + # we should validate all images in the datasets to use pycocotools + # so we do not drop last batch which maybe smaller than a normal batch + # you should pad the batch dimension in the outside + dataloader = torch.utils.data.DataLoader(dataset, + batch_size=batch_size, + drop_last=False, + num_workers=num_workers, + collate_fn=dataset.collate_fn) + + return dataloader + + +class FakeDataSet(torch.utils.data.Dataset): + def __init__(self, input_name_list, input_shape_list, input_dtype_list): + self.input_name_list = input_name_list + self.input_shape_list = input_shape_list + self.input_dtype_list = input_dtype_list + + self.max_length = 100000 + + def __len__(self): + return self.max_length + + def __getitem__(self, _): + input_data = [] + for shape, dtype in zip(self.input_shape_list, self.input_dtype_list): + if dtype.startswith("float"): + data = np.random.randn(*shape[1:]).astype(dtype) + elif dtype.startswith("int"): + data = np.random.randint(0, 10, shape[1:]).astype(dtype) + else: + raise ValueError(f"unsupported dtype: {dtype}") + + input_data.append(data) + + return tuple(input_data) + + + @staticmethod + def collate_fn(batch): + batch_input_data = [] + for i in zip(*batch): + data = np.concatenate([j[np.newaxis,:] for j in i], axis=0) + batch_input_data.append(data) + return tuple(batch_input_data) + +class NumpyDataSet(torch.utils.data.Dataset): + def __init__(self, input_name_list, input_shape_list, input_dtype_list, path): + self.input_name_list = input_name_list + self.input_shape_list = input_shape_list + self.input_dtype_list = input_dtype_list + self.path = path + + self.ext = os.path.splitext(self.path)[-1] + assert self.ext.endswith(".npy") or self.ext.endswith(".data") + + self.dtype_size_map = { + "fp32": np.dtype("float32"), + "float32": np.dtype("float32"), + "fp16": np.dtype("float16"), + "float16": np.dtype("float16"), + "int8": np.dtype("int8") + } + + self._process_numpy_data() + + def _process_numpy_data(self): + if self.ext.endswith(".npy"): + self.total_data_number = len(self.input_name_list) + + self.data = np.load(self.path, allow_pickle=True) + assert len(self.data) == self.total_data_number, f"np data length should be {self.total_data_number}, got {len(self.data)}" + self.length = self.data[0].shape[0] + + elif self.ext.endswith(".data"): + with open(self.path, mode='rb') as f: + calibrate_data = f.read() + + total_bytes = 0 + input_size_list = [] + for shape, dtype in zip(self.input_shape_list, self.input_dtype_list): + size = np.prod(shape) * self.dtype_size_map[dtype].itemsize + input_size_list.append(size) + total_bytes += size + + assert (len(calibrate_data) % total_bytes == 0), f"calibrate_data size({len(calibrate_data)}) don't match one batch size({total_bytes}) multiple." 
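+            # NOTE: the .data calibration buffer is read as whole batches stored back to back,
+            # each batch holding one contiguous block per input in declaration order; the loop
+            # below slices the buffer accordingly.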
+ + index = 0 + npy_data_dict = defaultdict(list) + while index < len(calibrate_data): + for name, shape, dtype, size in zip(self.input_name_list, self.input_shape_list, self.input_dtype_list, input_size_list): + data = np.frombuffer(calibrate_data[index: index + size], dtype=dtype).reshape(shape) + npy_data_dict[name].append(data) + index += size + + self.data = [] + for v in npy_data_dict.values(): + data = np.concatenate(v, axis=0) + self.data.append(data) + + self.length = self.data[0].shape[0] + else: + raise + + def __len__(self): + return self.length + + def __getitem__(self, index): + input_data = [] + for i in self.data: + input_data.append(i[index]) + return tuple(input_data) + + @staticmethod + def collate_fn(batch): + batch_input_data = [] + for i in zip(*batch): + data = np.concatenate([j[np.newaxis,:] for j in i], axis=0) + batch_input_data.append(data) + return tuple(batch_input_data) + +def download_builtin_data(): + if not os.path.exists(_builtin_data_path): + if not os.path.exists(_igie_cache_dir): + os.makedirs(_igie_cache_dir) + + pwd = os.getcwd() + os.chdir(_igie_cache_dir) + + cmd = f"wget {_bulitin_data_url}" + os.system(cmd) + + cmd = f"tar -xzf ci_data.tar.gz" + os.system(cmd) + + os.chdir(pwd) + + if os.path.exists(_builtin_data_path) and not os.path.exists(_symbolic_link_data_path): + cmd = f"ln -s {_builtin_data_path} {_symbolic_link_data_path}" + os.system(cmd) + + print(f"Use builtin dataset path: {_builtin_data_path}") + + +def get_dataloader_from_args(args): + ## use built-in dataset + if args.use_builtin_data: + download_builtin_data() + + if args.use_imagenet: + args.data_path = os.path.join(_builtin_data_path, "datasets", "imagenet") + + return get_imagenet_dataloader(args.data_path, args.batch_size, args.num_workers, args.model_framework, args.input_layout) + + elif args.use_coco2017: + args.data_path = os.path.join(_builtin_data_path, "datasets", "coco", "images", "val2017") + args.label_path = os.path.join(_builtin_data_path, "datasets", "coco", "annotations", "instances_val2017.json") + + input_shape = args.input_shape_list[0] + assert len(input_shape) == 4, f"input should be a 4d tensor, format as NCHW or NHWC, got {len(input_shape)}" + if args.input_layout == "NCHW": + assert input_shape[2] == input_shape[3], f"HW should be the same, got {input_shape[2]} and {input_shape[3]}" + args.image_size = input_shape[2] + else: #NHWC + assert input_shape[1] == input_shape[2], f"HW should be the same, got {input_shape[1]} and {input_shape[2]}" + args.image_size = input_shape[1] + + # use custom option do preprocessing + if args.custom_option is not None and 'process' in args.custom_option: + return get_coco2017_dataloader(args.data_path, args.label_path, args.batch_size, args.image_size, args.num_workers, args.model_framework, args.input_layout, args.custom_option['process']) + else: + return get_coco2017_dataloader(args.data_path, args.label_path, args.batch_size, args.image_size, args.num_workers, args.model_framework, args.input_layout) + + + elif args.calibration_file_path is not None: + ## NOTE(chen.chen) + ## user-provided dataset, just use it as calibration data + ## we support two format .npy and .data + + ## if extetion is .npy, it should be a single npy file, + ## each input should be saved in a np.ndarray which has beed preprocessed + ## e.g. 
for two inputs model + ## the npy should be a list of two array, the shape of each array is like below + ## ((100, 3, 224, 224), (100, 1000)) + + ## if extension is .data, we will call np.frombuffer to load the data + ## this is for paddle-igie compatibility and only support single input now + + + calibration_file_path = args.calibration_file_path + assert os.path.exists(calibration_file_path), f"can not find calibration file:{calibration_file_path}" + ext = os.path.splitext(calibration_file_path)[-1] + + assert ext in [".npy", ".data"], f"unspported calibration file format {ext}, it should be .npy or .data" + + dataset = NumpyDataSet(args.input_name_list, args.input_shape_list, args.input_dtype_list, calibration_file_path) + + dataloader = torch.utils.data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, drop_last=True, collate_fn=dataset.collate_fn) + + return dataloader + + else: + ## NOTE(chen.chen) + ## use fake data for calibration, just used for perf test + ## here we should know the shape/dtype info of the input to generate the fake input data + dataset = FakeDataSet(args.input_name_list, args.input_shape_list, args.input_dtype_list) + dataloader = torch.utils.data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, drop_last=True, collate_fn=dataset.collate_fn) + + return dataloader + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8229884b17d817c0b45938dc1d44e6840b15e02b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/__init__.py @@ -0,0 +1,9 @@ +# import torch first to make jit op work without `ImportError of libc10.so` +import torch + +from .jit_ops import FastCOCOEvalOp, JitOp + +try: + from .fast_coco_eval_api import COCOeval_opt +except ImportError: # exception will be raised when users build yolox from source + pass diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e63bc9952918060f55999ec100b283d83616b46 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.cpp @@ -0,0 +1,502 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include "cocoeval.h" +#include +#include +#include +#include + +using namespace pybind11::literals; + +namespace COCOeval { + +// Sort detections from highest score to lowest, such that +// detection_instances[detection_sorted_indices[t]] >= +// detection_instances[detection_sorted_indices[t+1]]. 
Use stable_sort to match +// original COCO API +void SortInstancesByDetectionScore( + const std::vector& detection_instances, + std::vector* detection_sorted_indices) { + detection_sorted_indices->resize(detection_instances.size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_instances](size_t j1, size_t j2) { + return detection_instances[j1].score > detection_instances[j2].score; + }); +} + +// Partition the ground truth objects based on whether or not to ignore them +// based on area +void SortInstancesByIgnore( + const std::array& area_range, + const std::vector& ground_truth_instances, + std::vector* ground_truth_sorted_indices, + std::vector* ignores) { + ignores->clear(); + ignores->reserve(ground_truth_instances.size()); + for (auto o : ground_truth_instances) { + ignores->push_back( + o.ignore || o.area < area_range[0] || o.area > area_range[1]); + } + + ground_truth_sorted_indices->resize(ground_truth_instances.size()); + std::iota( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + 0); + std::stable_sort( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + [&ignores](size_t j1, size_t j2) { + return (int)(*ignores)[j1] < (int)(*ignores)[j2]; + }); +} + +// For each IOU threshold, greedily match each detected instance to a ground +// truth instance (if possible) and store the results +void MatchDetectionsToGroundTruth( + const std::vector& detection_instances, + const std::vector& detection_sorted_indices, + const std::vector& ground_truth_instances, + const std::vector& ground_truth_sorted_indices, + const std::vector& ignores, + const std::vector>& ious, + const std::vector& iou_thresholds, + const std::array& area_range, + ImageEvaluation* results) { + // Initialize memory to store return data matches and ignore + const int num_iou_thresholds = iou_thresholds.size(); + const int num_ground_truth = ground_truth_sorted_indices.size(); + const int num_detections = detection_sorted_indices.size(); + std::vector ground_truth_matches( + num_iou_thresholds * num_ground_truth, 0); + std::vector& detection_matches = results->detection_matches; + std::vector& detection_ignores = results->detection_ignores; + std::vector& ground_truth_ignores = results->ground_truth_ignores; + detection_matches.resize(num_iou_thresholds * num_detections, 0); + detection_ignores.resize(num_iou_thresholds * num_detections, false); + ground_truth_ignores.resize(num_ground_truth); + for (auto g = 0; g < num_ground_truth; ++g) { + ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + for (auto d = 0; d < num_detections; ++d) { + // information about best match so far (match=-1 -> unmatched) + double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); + int match = -1; + for (auto g = 0; g < num_ground_truth; ++g) { + // if this ground truth instance is already matched and not a + // crowd, it cannot be matched to another detection + if (ground_truth_matches[t * num_ground_truth + g] > 0 && + !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { + continue; + } + + // if detected instance matched to a regular ground truth + // instance, we can break on the first ground truth instance + // tagged as ignore (because they are sorted by the ignore tag) + if (match >= 0 && !ground_truth_ignores[match] && + ground_truth_ignores[g]) 
{ + break; + } + + // if IOU overlap is the best so far, store the match appropriately + if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { + best_iou = ious[d][ground_truth_sorted_indices[g]]; + match = g; + } + } + // if match was made, store id of match for both detection and + // ground truth + if (match >= 0) { + detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; + detection_matches[t * num_detections + d] = + ground_truth_instances[ground_truth_sorted_indices[match]].id; + ground_truth_matches[t * num_ground_truth + match] = + detection_instances[detection_sorted_indices[d]].id; + } + + // set unmatched detections outside of area range to ignore + const InstanceAnnotation& detection = + detection_instances[detection_sorted_indices[d]]; + detection_ignores[t * num_detections + d] = + detection_ignores[t * num_detections + d] || + (detection_matches[t * num_detections + d] == 0 && + (detection.area < area_range[0] || detection.area > area_range[1])); + } + } + + // store detection score results + results->detection_scores.resize(detection_sorted_indices.size()); + for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { + results->detection_scores[d] = + detection_instances[detection_sorted_indices[d]].score; + } +} + +std::vector EvaluateImages( + const std::vector>& area_ranges, + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances) { + const int num_area_ranges = area_ranges.size(); + const int num_images = image_category_ground_truth_instances.size(); + const int num_categories = + image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; + std::vector detection_sorted_indices; + std::vector ground_truth_sorted_indices; + std::vector ignores; + std::vector results_all( + num_images * num_area_ranges * num_categories); + + // Store results for each image, category, and area range combination. 
Results + // for each IOU threshold are packed into the same ImageEvaluation object + for (auto i = 0; i < num_images; ++i) { + for (auto c = 0; c < num_categories; ++c) { + const std::vector& ground_truth_instances = + image_category_ground_truth_instances[i][c]; + const std::vector& detection_instances = + image_category_detection_instances[i][c]; + + SortInstancesByDetectionScore( + detection_instances, &detection_sorted_indices); + if ((int)detection_sorted_indices.size() > max_detections) { + detection_sorted_indices.resize(max_detections); + } + + for (size_t a = 0; a < area_ranges.size(); ++a) { + SortInstancesByIgnore( + area_ranges[a], + ground_truth_instances, + &ground_truth_sorted_indices, + &ignores); + + MatchDetectionsToGroundTruth( + detection_instances, + detection_sorted_indices, + ground_truth_instances, + ground_truth_sorted_indices, + ignores, + image_category_ious[i][c], + iou_thresholds, + area_ranges[a], + &results_all + [c * num_area_ranges * num_images + a * num_images + i]); + } + } + } + + return results_all; +} + +// Convert a python list to a vector +template +std::vector list_to_vec(const py::list& l) { + std::vector v(py::len(l)); + for (int i = 0; i < (int)py::len(l); ++i) { + v[i] = l[i].cast(); + } + return v; +} + +// Helper function to Accumulate() +// Considers the evaluation results applicable to a particular category, area +// range, and max_detections parameter setting, which begin at +// evaluations[evaluation_index]. Extracts a sorted list of length n of all +// applicable detection instances concatenated across all images in the dataset, +// which are represented by the outputs evaluation_indices, detection_scores, +// image_detection_indices, and detection_sorted_indices--all of which are +// length n. evaluation_indices[i] stores the applicable index into +// evaluations[] for instance i, which has detection score detection_score[i], +// and is the image_detection_indices[i]'th of the list of detections +// for the image containing i. 
detection_sorted_indices[] defines a sorted +// permutation of the 3 other outputs +int BuildSortedDetectionList( + const std::vector& evaluations, + const int64_t evaluation_index, + const int64_t num_images, + const int max_detections, + std::vector* evaluation_indices, + std::vector* detection_scores, + std::vector* detection_sorted_indices, + std::vector* image_detection_indices) { + assert(evaluations.size() >= evaluation_index + num_images); + + // Extract a list of object instances of the applicable category, area + // range, and max detections requirements such that they can be sorted + image_detection_indices->clear(); + evaluation_indices->clear(); + detection_scores->clear(); + image_detection_indices->reserve(num_images * max_detections); + evaluation_indices->reserve(num_images * max_detections); + detection_scores->reserve(num_images * max_detections); + int num_valid_ground_truth = 0; + for (auto i = 0; i < num_images; ++i) { + const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; + + for (int d = 0; + d < (int)evaluation.detection_scores.size() && d < max_detections; + ++d) { // detected instances + evaluation_indices->push_back(evaluation_index + i); + image_detection_indices->push_back(d); + detection_scores->push_back(evaluation.detection_scores[d]); + } + for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { + if (!ground_truth_ignore) { + ++num_valid_ground_truth; + } + } + } + + // Sort detections by decreasing score, using stable sort to match + // python implementation + detection_sorted_indices->resize(detection_scores->size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_scores](size_t j1, size_t j2) { + return (*detection_scores)[j1] > (*detection_scores)[j2]; + }); + + return num_valid_ground_truth; +} + +// Helper function to Accumulate() +// Compute a precision recall curve given a sorted list of detected instances +// encoded in evaluations, evaluation_indices, detection_scores, +// detection_sorted_indices, image_detection_indices (see +// BuildSortedDetectionList()). Using vectors precisions and recalls +// and temporary storage, output the results into precisions_out, recalls_out, +// and scores_out, which are large buffers containing many precion/recall curves +// for all possible parameter settings, with precisions_out_index and +// recalls_out_index defining the applicable indices to store results. 
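+// Precisions are made monotonically non-increasing (the usual precision
+// envelope) before being sampled at each recall threshold with lower_bound,
+// mirroring the Python COCO API accumulate().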
+void ComputePrecisionRecallCurve( + const int64_t precisions_out_index, + const int64_t precisions_out_stride, + const int64_t recalls_out_index, + const std::vector& recall_thresholds, + const int iou_threshold_index, + const int num_iou_thresholds, + const int num_valid_ground_truth, + const std::vector& evaluations, + const std::vector& evaluation_indices, + const std::vector& detection_scores, + const std::vector& detection_sorted_indices, + const std::vector& image_detection_indices, + std::vector* precisions, + std::vector* recalls, + std::vector* precisions_out, + std::vector* scores_out, + std::vector* recalls_out) { + assert(recalls_out->size() > recalls_out_index); + + // Compute precision/recall for each instance in the sorted list of detections + int64_t true_positives_sum = 0, false_positives_sum = 0; + precisions->clear(); + recalls->clear(); + precisions->reserve(detection_sorted_indices.size()); + recalls->reserve(detection_sorted_indices.size()); + assert(!evaluations.empty() || detection_sorted_indices.empty()); + for (auto detection_sorted_index : detection_sorted_indices) { + const ImageEvaluation& evaluation = + evaluations[evaluation_indices[detection_sorted_index]]; + const auto num_detections = + evaluation.detection_matches.size() / num_iou_thresholds; + const auto detection_index = iou_threshold_index * num_detections + + image_detection_indices[detection_sorted_index]; + assert(evaluation.detection_matches.size() > detection_index); + assert(evaluation.detection_ignores.size() > detection_index); + const int64_t detection_match = + evaluation.detection_matches[detection_index]; + const bool detection_ignores = + evaluation.detection_ignores[detection_index]; + const auto true_positive = detection_match > 0 && !detection_ignores; + const auto false_positive = detection_match == 0 && !detection_ignores; + if (true_positive) { + ++true_positives_sum; + } + if (false_positive) { + ++false_positives_sum; + } + + const double recall = + static_cast(true_positives_sum) / num_valid_ground_truth; + recalls->push_back(recall); + const int64_t num_valid_detections = + true_positives_sum + false_positives_sum; + const double precision = num_valid_detections > 0 + ? static_cast(true_positives_sum) / num_valid_detections + : 0.0; + precisions->push_back(precision); + } + + (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; + + for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { + if ((*precisions)[i] > (*precisions)[i - 1]) { + (*precisions)[i - 1] = (*precisions)[i]; + } + } + + // Sample the per instance precision/recall list at each recall threshold + for (size_t r = 0; r < recall_thresholds.size(); ++r) { + // first index in recalls >= recall_thresholds[r] + std::vector::iterator low = std::lower_bound( + recalls->begin(), recalls->end(), recall_thresholds[r]); + size_t precisions_index = low - recalls->begin(); + + const auto results_ind = precisions_out_index + r * precisions_out_stride; + assert(results_ind < precisions_out->size()); + assert(results_ind < scores_out->size()); + if (precisions_index < precisions->size()) { + (*precisions_out)[results_ind] = (*precisions)[precisions_index]; + (*scores_out)[results_ind] = + detection_scores[detection_sorted_indices[precisions_index]]; + } else { + (*precisions_out)[results_ind] = 0; + (*scores_out)[results_ind] = 0; + } + } +} +py::dict Accumulate( + const py::object& params, + const std::vector& evaluations) { + const std::vector recall_thresholds = + list_to_vec(params.attr("recThrs")); + const std::vector max_detections = + list_to_vec(params.attr("maxDets")); + const int num_iou_thresholds = py::len(params.attr("iouThrs")); + const int num_recall_thresholds = py::len(params.attr("recThrs")); + const int num_categories = params.attr("useCats").cast() == 1 + ? py::len(params.attr("catIds")) + : 1; + const int num_area_ranges = py::len(params.attr("areaRng")); + const int num_max_detections = py::len(params.attr("maxDets")); + const int num_images = py::len(params.attr("imgIds")); + + std::vector precisions_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + std::vector recalls_out( + num_iou_thresholds * num_categories * num_area_ranges * + num_max_detections, + -1); + std::vector scores_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + + // Consider the list of all detected instances in the entire dataset in one + // large list. evaluation_indices, detection_scores, + // image_detection_indices, and detection_sorted_indices all have the same + // length as this list, such that each entry corresponds to one detected + // instance + std::vector evaluation_indices; // indices into evaluations[] + std::vector detection_scores; // detection scores of each instance + std::vector detection_sorted_indices; // sorted indices of all + // instances in the dataset + std::vector + image_detection_indices; // indices into the list of detected instances in + // the same image as each instance + std::vector precisions, recalls; + + for (auto c = 0; c < num_categories; ++c) { + for (auto a = 0; a < num_area_ranges; ++a) { + for (auto m = 0; m < num_max_detections; ++m) { + // The COCO PythonAPI assumes evaluations[] (the return value of + // COCOeval::EvaluateImages() is one long list storing results for each + // combination of category, area range, and image id, with categories in + // the outermost loop and images in the innermost loop. 
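+        // evaluations_index points at the first image's results for this
+        // (category, area range) pair; BuildSortedDetectionList() walks the
+        // following num_images entries of evaluations[].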
+ const int64_t evaluations_index = + c * num_area_ranges * num_images + a * num_images; + int num_valid_ground_truth = BuildSortedDetectionList( + evaluations, + evaluations_index, + num_images, + max_detections[m], + &evaluation_indices, + &detection_scores, + &detection_sorted_indices, + &image_detection_indices); + + if (num_valid_ground_truth == 0) { + continue; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + // recalls_out is a flattened vectors representing a + // num_iou_thresholds X num_categories X num_area_ranges X + // num_max_detections matrix + const int64_t recalls_out_index = + t * num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + // precisions_out and scores_out are flattened vectors + // representing a num_iou_thresholds X num_recall_thresholds X + // num_categories X num_area_ranges X num_max_detections matrix + const int64_t precisions_out_stride = + num_categories * num_area_ranges * num_max_detections; + const int64_t precisions_out_index = t * num_recall_thresholds * + num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + ComputePrecisionRecallCurve( + precisions_out_index, + precisions_out_stride, + recalls_out_index, + recall_thresholds, + t, + num_iou_thresholds, + num_valid_ground_truth, + evaluations, + evaluation_indices, + detection_scores, + detection_sorted_indices, + image_detection_indices, + &precisions, + &recalls, + &precisions_out, + &scores_out, + &recalls_out); + } + } + } + } + + time_t rawtime; + struct tm local_time; + std::array buffer; + time(&rawtime); +#ifdef _WIN32 + localtime_s(&local_time, &rawtime); +#else + localtime_r(&rawtime, &local_time); +#endif + strftime( + buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time); + return py::dict( + "params"_a = params, + "counts"_a = std::vector({num_iou_thresholds, + num_recall_thresholds, + num_categories, + num_area_ranges, + num_max_detections}), + "date"_a = buffer, + "precision"_a = precisions_out, + "recall"_a = recalls_out, + "scores"_a = scores_out); +} + +} // namespace COCOeval diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h new file mode 100644 index 0000000000000000000000000000000000000000..dbf5aab4b8303b8e199f10e1ecf2f634ca29cb42 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/cocoeval/cocoeval.h @@ -0,0 +1,98 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +namespace COCOeval { + +// Annotation data for a single object instance in an image +struct InstanceAnnotation { + InstanceAnnotation( + uint64_t id, + double score, + double area, + bool is_crowd, + bool ignore) + : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} + uint64_t id; + double score = 0.; + double area = 0.; + bool is_crowd = false; + bool ignore = false; +}; + +// Stores intermediate results for evaluating detection results for a single +// image that has D detected instances and G ground truth instances. 
This stores +// matches between detected and ground truth instances +struct ImageEvaluation { + // For each of the D detected instances, the id of the matched ground truth + // instance, or 0 if unmatched + std::vector detection_matches; + + // The detection score of each of the D detected instances + std::vector detection_scores; + + // Marks whether or not each of G instances was ignored from evaluation (e.g., + // because it's outside area_range) + std::vector ground_truth_ignores; + + // Marks whether or not each of D instances was ignored from evaluation (e.g., + // because it's outside aRng) + std::vector detection_ignores; +}; + +template +using ImageCategoryInstances = std::vector>>; + +// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each +// combination of image, category, area range settings, and IOU thresholds to +// evaluate, it matches detected instances to ground truth instances and stores +// the results into a vector of ImageEvaluation results, which will be +// interpreted by the COCOeval::Accumulate() function to produce precion-recall +// curves. The parameters of nested vectors have the following semantics: +// image_category_ious[i][c][d][g] is the intersection over union of the d'th +// detected instance and g'th ground truth instance of +// category category_ids[c] in image image_ids[i] +// image_category_ground_truth_instances[i][c] is a vector of ground truth +// instances in image image_ids[i] of category category_ids[c] +// image_category_detection_instances[i][c] is a vector of detected +// instances in image image_ids[i] of category category_ids[c] +std::vector EvaluateImages( + const std::vector>& area_ranges, // vector of 2-tuples + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances); + +// C++ implementation of COCOeval.accumulate(), which generates precision +// recall curves for each set of category, IOU threshold, detection area range, +// and max number of detections parameters. It is assumed that the parameter +// evaluations is the return value of the functon COCOeval::EvaluateImages(), +// which was called with the same parameter settings params +py::dict Accumulate( + const py::object& params, + const std::vector& evalutations); + +} // namespace COCOeval + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); + m.def( + "COCOevalEvaluateImages", + &COCOeval::EvaluateImages, + "COCOeval::EvaluateImages"); + pybind11::class_(m, "InstanceAnnotation") + .def(pybind11::init()); + pybind11::class_(m, "ImageEvaluation") + .def(pybind11::init<>()); +} diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py new file mode 100644 index 0000000000000000000000000000000000000000..374031ab8fa8738f96cbfec50985d2e0ae406e53 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/fast_coco_eval_api.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# This file comes from +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# Copyright (c) Megvii Inc. All rights reserved. + +import copy +import time + +import numpy as np +from pycocotools.cocoeval import COCOeval + +from .jit_ops import FastCOCOEvalOp + + +class COCOeval_opt(COCOeval): + """ + This is a slightly modified version of the original COCO API, where the functions evaluateImg() + and accumulate() are implemented in C++ to speedup evaluation + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + try: + self.module = FastCOCOEvalOp().load() + except: + raise ImportError + + def evaluate(self): + """ + Run per image evaluation on given images and store results in self.evalImgs_cpp, a + datastructure that isn't readable from Python but is used by a c++ implementation of + accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure + self.evalImgs because this datastructure is a computational bottleneck. + :return: None + """ + tic = time.time() + + print("Running per image evaluation...") + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + print( + "useSegm (deprecated) is not None. Running {} evaluation".format( + p.iouType + ) + ) + print("Evaluate annotation type *{}*".format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == "segm" or p.iouType == "bbox": + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + maxDet = p.maxDets[-1] + + # <<<< Beginning of code differences with original COCO API + def convert_instances_to_cpp(instances, is_det=False): + # Convert annotations for a list of instances in an image to a format that's fast + # to access in C++ + instances_cpp = [] + for instance in instances: + instance_cpp = self.module.InstanceAnnotation( + int(instance["id"]), + instance["score"] if is_det else instance.get("score", 0.0), + instance["area"], + bool(instance.get("iscrowd", 0)), + bool(instance.get("ignore", 0)), + ) + instances_cpp.append(instance_cpp) + return instances_cpp + + # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++ + ground_truth_instances = [ + [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds] + for imgId in p.imgIds + ] + detected_instances = [ + [ + convert_instances_to_cpp(self._dts[imgId, catId], is_det=True) + for catId in p.catIds + ] + for imgId in p.imgIds + ] + ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds] + + if not p.useCats: + # For each image, flatten per-category lists into a single list + ground_truth_instances = [ + [[o for c in i for o in c]] for i in ground_truth_instances + ] + detected_instances = [ + [[o for c in i for o in c]] for i in detected_instances + ] + + # Call C++ implementation of self.evaluateImgs() + self._evalImgs_cpp = self.module.COCOevalEvaluateImages( + p.areaRng, + maxDet, + p.iouThrs, + ious, + ground_truth_instances, + detected_instances, + ) + self._evalImgs = None + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic)) 
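+        # the per-image results are kept in self._evalImgs_cpp (opaque to Python)
+        # and are consumed by the C++ COCOevalAccumulate call in accumulate() below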
+ # >>>> End of code differences with original COCO API + + def accumulate(self): + """ + Accumulate per image evaluation results and store the result in self.eval. Does not + support changing parameter settings from those used by self.evaluate() + """ + print("Accumulating evaluation results...") + tic = time.time() + if not hasattr(self, "_evalImgs_cpp"): + print("Please run evaluate() first") + + self.eval = self.module.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp) + + # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections + self.eval["recall"] = np.array(self.eval["recall"]).reshape( + self.eval["counts"][:1] + self.eval["counts"][2:] + ) + + # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X + # num_area_ranges X num_max_detections + self.eval["precision"] = np.array(self.eval["precision"]).reshape( + self.eval["counts"] + ) + self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"]) + toc = time.time() + print( + "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic) + ) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..cce3195ff4b796542670a23bc32bcd67dc8aed55 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/fastCoCoeval/jit_ops.py @@ -0,0 +1,179 @@ +import glob +import importlib +import os +import sys +import time +from typing import List +from torch import distributed as dist +from contextlib import contextmanager + +__all__ = ["JitOp", "FastCOCOEvalOp"] + +_LOCAL_PROCESS_GROUP = None + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if _LOCAL_PROCESS_GROUP is None: + return get_rank() + + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + +@contextmanager +def wait_for_the_master(local_rank: int = None): + """ + Make all processes waiting for the master to do some task. + + Args: + local_rank (int): the rank of the current process. Default to None. + If None, it will use the rank of the current process. + """ + if local_rank is None: + local_rank = get_local_rank() + + if local_rank > 0: + dist.barrier() + yield + if local_rank == 0: + if not dist.is_available(): + return + if not dist.is_initialized(): + return + else: + dist.barrier() + +class JitOp: + """ + Just-in-time compilation of ops. + + Some code of `JitOp` is inspired by `deepspeed.op_builder`, + check the following link for more details: + https://github.com/microsoft/DeepSpeed/blob/master/op_builder/builder.py + """ + + def __init__(self, name): + self.name = name + + def absolute_name(self) -> str: + """Get absolute build path for cases where the op is pre-installed.""" + pass + + def sources(self) -> List: + """Get path list of source files of op. + + NOTE: the path should be elative to root of package during building, + Otherwise, exception will be raised when building package. + However, for runtime building, path will be absolute. 
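A hedged sketch of how `wait_for_the_master` is intended to be used (mirroring what `JitOp.load()` does further down): every rank runs the body, but the master rank runs it first while the other ranks block on a barrier, so an artifact the master builds or caches can simply be reused by the rest.

```python
def build_or_load_extension():
    # placeholder for a one-time task, e.g. JIT-compiling a C++ op or
    # downloading a checkpoint; the result should be cached on disk
    ...

with wait_for_the_master():
    module = build_or_load_extension()  # master builds first, the others hit the cache
```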
+ """ + pass + + def include_dirs(self) -> List: + """ + Get list of include paths, relative to root of package. + + NOTE: the path should be elative to root of package. + Otherwise, exception will be raised when building package. + """ + return [] + + def define_macros(self) -> List: + """Get list of macros to define for op""" + return [] + + def cxx_args(self) -> List: + """Get optional list of compiler flags to forward""" + args = ["-O2"] if sys.platform == "win32" else ["-O3", "-std=c++14", "-g", "-Wno-reorder"] + return args + + def nvcc_args(self) -> List: + """Get optional list of compiler flags to forward to nvcc when building CUDA sources""" + args = [ + "-O3", "--use_fast_math", + "-std=c++17" if sys.platform == "win32" else "-std=c++14", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + ] + return args + + def build_op(self): + from torch.utils.cpp_extension import CppExtension + return CppExtension( + name=self.absolute_name(), + sources=self.sources(), + include_dirs=self.include_dirs(), + define_macros=self.define_macros(), + extra_compile_args={ + "cxx": self.cxx_args(), + }, + ) + + def load(self, verbose=False): + try: + # try to import op from pre-installed package + return importlib.import_module(self.absolute_name()) + except Exception: # op not compiled, jit load + with wait_for_the_master(): # to avoid race condition + return self.jit_load(verbose) + + def jit_load(self, verbose=False): + from torch.utils.cpp_extension import load + try: + import ninja # noqa + except ImportError: + if verbose: + print( + f"Ninja is not installed, fall back to normal installation for {self.name}." + ) + + build_tik = time.time() + # build op and load + op_module = load( + name=self.name, + sources=self.sources(), + extra_cflags=self.cxx_args(), + extra_cuda_cflags=self.nvcc_args(), + verbose=verbose, + ) + build_duration = time.time() - build_tik + if verbose: + print(f"Load {self.name} op in {build_duration:.3f}s.") + return op_module + + def clear_dynamic_library(self): + """Remove dynamic libraray files generated by JIT compilation.""" + module = self.load() + os.remove(module.__file__) + + +class FastCOCOEvalOp(JitOp): + + def __init__(self, name="fast_cocoeval"): + super().__init__(name=name) + + def absolute_name(self): + return f'fastCoCoeval.{self.name}' + + def sources(self): + sources = glob.glob(os.path.join("fastCoCoeval", "cocoeval", "*.cpp")) + if not sources: # source will be empty list if the so file is removed after install + # use abosolute path to compile + code_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "cocoeval", "*.cpp") + sources = glob.glob(code_path) + return sources + + def include_dirs(self): + return [os.path.join("fastCoCoeval", "cocoeval")] diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py new file mode 100644 index 0000000000000000000000000000000000000000..5b413788a026c025aec5ec69fad0018cfc470b1f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/file.py @@ -0,0 +1,20 @@ +import os +import json + +def download_file(src_url, save_path): + if not os.path.exists(save_path): + cmd = f"wget {src_url}" + os.system(cmd) + + assert os.path.exists(save_path) + + +def load_json(path): + with open(path, "r") as f: + data = json.load(f) + return data + + +def save_json(data, path): + with open(path, "w") as f: + 
json.dump(data, f, indent=4) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d034bc0a15441cfc0c920d5bfc21a1cc4e24d6b8 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/imagenet_metric.py @@ -0,0 +1,23 @@ +import torch +import numpy as np + + +def get_topk_accuracy(pred, label): + if isinstance(pred, np.ndarray): + pred = torch.from_numpy(pred) + + if isinstance(label, np.ndarray): + label = torch.from_numpy(label) + + top1_acc = 0 + top5_acc = 0 + for idx in range(len(label)): + label_value = label[idx] + if label_value == torch.topk(pred[idx].float(), 1).indices.data: + top1_acc += 1 + top5_acc += 1 + + elif label_value in torch.topk(pred[idx].float(), 5).indices.data: + top5_acc += 1 + + return top1_acc, top5_acc \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py new file mode 100644 index 0000000000000000000000000000000000000000..f0b5e12c5ab86fcd4149eb7b7354e0a8222ad124 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/import_model.py @@ -0,0 +1,113 @@ +import os +import shutil +import onnx +import torch +import torchvision +import tensorflow as tf +try: + tf_compat_v1 = tf.compat.v1 +except ImportError: + tf_compat_v1 = tf + +import tvm +from tvm import relay +import tvm.relay.testing.tf as tf_testing +from .onnx_util import get_batch_size, rewrite_int64_input_to_int32 +from .onnx_rewrite_batch_size import rewrite_batch_size +from .argument import to_bool +from tvm.relay.transform.iluvatar import SimplifyGraph + +def import_model_to_igie(model_path_or_name, input_dict, model_framework): + + base_name = os.path.splitext(os.path.split(model_path_or_name)[1])[0] + cache_hash = f"{base_name}_cache_dir" + mod_path = os.path.join(cache_hash, "mod.cache") + params_path = os.path.join(cache_hash, "params.cache") + + # find cached mod and params + if os.path.exists(cache_hash) and to_bool(os.environ.get("IGIE_USE_CACHE", False)): + with open(mod_path, "r") as mod_file: + mod = tvm.parser.fromtext(mod_file.read()) + + with open(params_path, "rb") as params_file: + params = relay.load_param_dict(params_file.read()) + + return mod, params + + paddle_dir_path = os.path.split(model_path_or_name)[0] + if os.path.exists(model_path_or_name) or os.path.exists(paddle_dir_path): + if model_framework == "onnx": + batch_size = list(input_dict.values())[0][0] + model_path = model_path_or_name + + + # we don't want to handle multi_input case here, + # e.g. 
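A small illustrative check of `get_topk_accuracy` with made-up logits (six classes, two samples). Note that it returns raw hit counts for the batch, so callers divide by the number of samples to obtain an accuracy percentage.

```python
import numpy as np

logits = np.array([
    [0.10, 0.70, 0.05, 0.02, 0.08, 0.05],   # top-1 prediction: class 1
    [0.50, 0.10, 0.12, 0.08, 0.11, 0.09],   # top-1 prediction: class 0
])
labels = np.array([1, 2])

top1, top5 = get_topk_accuracy(logits, labels)
print(top1, top5)  # 1, 2 -> sample 0 is a top-1 hit, sample 1 only a top-5 hit
```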
input_ids:1000,22 pixel_values:32,3,224,224 attention_mask:1000,22 for clip model + if len(input_dict) == 1: + batch_size_from_model = get_batch_size(model_path_or_name) + if isinstance(batch_size_from_model, int) and batch_size_from_model != batch_size: + model_path = f"{model_path[:-5]}_rewrite_b{batch_size}.onnx" + rewrite_batch_size(model_path_or_name, batch_size, save_model_path=model_path) + + model = onnx.load(model_path) + # model = rewrite_int64_input_to_int32(model) + mod, params = relay.frontend.from_onnx(model, input_dict, freeze_params=True) + + elif model_framework == "pytorch": + scripted_model = torch.jit.load(model_path_or_name).eval() + input_infos = [(k, v) for k, v in input_dict.items()] + mod, params = relay.frontend.from_pytorch(scripted_model, input_infos=input_infos) + + elif model_framework == "tensorflow": + with tf_compat_v1.gfile.GFile(model_path_or_name, "rb") as f: + graph_def = tf_compat_v1.GraphDef() + graph_def.ParseFromString(f.read()) + graph_def = tf_testing.ProcessGraphDefParam(graph_def) + mod, params = relay.frontend.from_tensorflow(graph_def, shape=input_dict) + + elif model_framework == "paddle": + import paddle + model = paddle.jit.load(model_path_or_name) + mod, params = relay.frontend.from_paddle(model, input_dict) + else: + raise ValueError(f"framwork {model_framework} is not supported yet") + + else: + # In this case we will try to find from tochvision + # e.g. model_path_or_name="resnet18" + + try: + import ssl + ssl._create_default_https_context = ssl._create_unverified_context + model = getattr(torchvision.models, model_path_or_name)(pretrained=True).eval() + except: + raise ValueError(f"can not find model {model_path_or_name} from torchvision and current working directory") + + + input_datas = [] + for shape in input_dict.values(): + # currently torchvision model should always use float32 input + input_datas.append(torch.randn(shape)) + + scripted_model = torch.jit.trace(model, tuple(input_datas)).eval() + input_infos = [(k, v) for k, v in input_dict.items()] + mod, params = relay.frontend.from_pytorch(scripted_model, input_infos=input_infos) + + # save cache + if to_bool(os.environ.get("IGIE_USE_CACHE", False)): + if os.path.exists(cache_hash): + shutil.rmtree(cache_hash) + os.makedirs(cache_hash) + + mod_path = os.path.join(cache_hash, "mod.cache") + with open(mod_path, "w") as mod_file: + mod_file.write(mod.astext()) + + params_path = os.path.join(cache_hash, "params.cache") + with open(params_path, "wb") as params_file: + params_file.write(relay.save_param_dict(params)) + + # need SimlifyGraph mod when importing onnx models, especially the model contains Q/DQ node + mod = SimplifyGraph(mod, params) + + return mod, params \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py new file mode 100644 index 0000000000000000000000000000000000000000..452916efb0db565d38b9ce4baeb68ed01bd94815 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/mod_rewriter.py @@ -0,0 +1,81 @@ +import tvm +from tvm import relay +from tvm.relay import Expr +from tvm.relay.dataflow_pattern import wildcard, is_constant, is_op, DFPatternCallback, rewrite +from tvm.relay.expr_functor import ExprMutator + +#TODO(chen.chen): we should move this class to igie repo +class MainFunctionParamsRewriter(ExprMutator): + def __init__(self, target_input_dict, 
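A hypothetical call into `import_model_to_igie`, assuming an ONNX classification model with a single NCHW input named `input` (the file name and input name are placeholders). Setting `IGIE_USE_CACHE=1` additionally caches the converted `mod`/`params` in a `<model>_cache_dir` directory under the current working directory.

```python
input_dict = {"input": (32, 3, 224, 224)}
mod, params = import_model_to_igie("resnet50.onnx", input_dict, model_framework="onnx")

# A bare torchvision model name falls back to tracing the pretrained torchvision model:
# mod, params = import_model_to_igie("resnet18", {"input": (1, 3, 224, 224)},
#                                    model_framework="pytorch")
```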
preprocess_rewriter=None): + self.target_input = target_input_dict + self.preprocess_rewriter = preprocess_rewriter + self.target_input_name_list = list(self.target_input.keys()) + + super().__init__() + + def visit_function(self, fn): + params = [self.visit(i) for i in fn.params] + body = self.visit(fn.body) + + original_input_name_list = [param.name_hint for param in params] + assert len(set(self.target_input_name_list) - set(original_input_name_list)) == 0, f"invalid target_input_name: {set(self.target_input_name_list) - set(original_input_name_list)}" + + new_params = [] + bind = {} + for param in params: + old_param = param + name = param.name_hint + + new_param = old_param + if name in self.target_input: + shape = self.target_input[name][0] + if len(self.target_input[name]) == 2: + dtype = self.target_input[name][1] + else: + dtype = old_param.type_annotation.dtype + new_param = relay.var(name_hint=name, shape=shape, dtype=dtype) + + new_params.append(new_param) + bind[old_param] = new_param + + new_body = relay.expr.bind(body, bind) + + new_function = relay.Function(params=new_params, + body=new_body, + ret_type=None, + type_params=fn.type_params, + attrs=fn.attrs) + return new_function + + def __call__(self, mod): + if self.preprocess_rewriter: + mod["main"] = rewrite(self.preprocess_rewriter, mod["main"]) + mod["main"] = self.visit(mod["main"]) + return mod + + +# TODO(chen.chen) this function is designeg for bert model, but it doesn't work now +# the reason is that, position_embedding is fixed when mod is generated from onnx +# e.g. the meta[relay.Constant][51] is fixed as 256 +# even if we rewrite the seq_len to 384, the InferType will failed for %9 = add(%8, meta[relay.Constant][51] /* ty=Tensor[(1, 256, 768), float32] */) + +# def @main(%input_ids: Tensor[(8, 256), int64], %attention_mask: Tensor[(8, 256), int64], %token_type_ids: Tensor[(8, 256), int64]) -> (Tensor[(8, 256), float32], Tensor[(8, 256), float32]) { +# %0 = less(%input_ids, 0 /* ty=int64 */) /* ty=Tensor[(8, 256), bool] */; +# %1 = add(%input_ids, 30522 /* ty=int64 */) /* ty=Tensor[(8, 256), int64] */; +# %2 = where(%0, %1, %input_ids) /* ty=Tensor[(8, 256), int64] */; +# %3 = less(%token_type_ids, 0 /* ty=int64 */) /* ty=Tensor[(8, 256), bool] */; +# %4 = add(%token_type_ids, 2 /* ty=int64 */) /* ty=Tensor[(8, 256), int64] */; +# %5 = where(%3, %4, %token_type_ids) /* ty=Tensor[(8, 256), int64] */; +# %6 = take(meta[relay.Constant][49] /* ty=Tensor[(30522, 768), float32] */, %2, axis=0) /* ty=Tensor[(8, 256, 768), float32] */; +# %7 = take(meta[relay.Constant][50] /* ty=Tensor[(2, 768), float32] */, %5, axis=0) /* ty=Tensor[(8, 256, 768), float32] */; +# %8 = add(%6, %7) /* ty=Tensor[(8, 256, 768), float32] */; +# %9 = add(%8, meta[relay.Constant][51] /* ty=Tensor[(1, 256, 768), float32] */) /* ty=Tensor[(8, 256, 768), float32] */; + + +def modify_seq_len_for_nlp(mod, input_dict, target_seq_len): + target_input_dict = {} + for name, shape in input_dict.items(): + target_input_dict[name] = [(shape[0], target_seq_len)] + mod = relay.transform.InferType()(mod) + mod = MainFunctionParamsRewriter(target_input_dict=target_input_dict)(mod) + return mod \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py new file mode 100644 index 0000000000000000000000000000000000000000..5332febfb8f2ce169bafaf9c74683814f07ac8b0 --- /dev/null +++ 
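A hedged usage sketch for `MainFunctionParamsRewriter`: rewrite the shape of one input of a Relay module while keeping its dtype. `mod` is assumed to be a `tvm.IRModule` whose main function has an input named `input`; the value format matches what `modify_seq_len_for_nlp` builds (a list holding the new shape, optionally followed by a dtype string).

```python
from tvm import relay

target_input_dict = {"input": [(8, 3, 224, 224)]}   # new shape; dtype is kept
mod = relay.transform.InferType()(mod)
mod = MainFunctionParamsRewriter(target_input_dict=target_input_dict)(mod)
```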
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_rewrite_batch_size.py @@ -0,0 +1,113 @@ +""" +rewrite src onnx model and infer shape if possible, current sypport + +1. rewrite batch_size, e.g 1x3x640x640 -> 32x3x640x640 + +Attention: +1. all inputs/outputs batchszie dim will be modified together, which means some NLP/Audio senquence models will introduce problems + + +""" +import onnx +from onnx import OperatorSetIdProto +import onnx.numpy_helper + +import onnxoptimizer +from onnxsim import simplify + +from .onnx_util import get_batch_size, rewrite_tensor_batch_size + +def rewrite_batch_size(model, + batch_size, + modify_reshape_dim=True, + save_model_path=None): + + ## rewrite input and output + if isinstance(model, str): + model = onnx.load(model) + + + ## there is a issue that when the onnx model comes from tf, + ## some shape info is stored as constant node's output instead of initializer + passes = [ + "extract_constant_to_initializer", "eliminate_unused_initializer" + ] + model = onnxoptimizer.optimize(model, passes) + + + + # to support qlinear op if the opset_import is not supported + # if we have some ohter domains need to import, add them here + ms_opset = OperatorSetIdProto() + ms_opset.domain = "com.microsoft" + ms_opset.version = 1 + + ori_opset_import = model.opset_import + + if ms_opset not in ori_opset_import: + ori_opset_import.append(ms_opset) + + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + + + graph = model.graph + initializer = graph.initializer + inputs = graph.input + outputs = graph.output + nodes = graph.node + + ori_batch_size = get_batch_size(model) + + ## in case that some onnx model inputs contain initializers' shape info, we will remove them to avoid rewriting input failure + + initializer_names = set([i.name for i in initializer]) + import copy + tmp_inputs = copy.deepcopy(inputs) + for i in tmp_inputs: + if i.name in initializer_names: + inputs.remove(i) + + for i in inputs: + rewrite_tensor_batch_size(i, batch_size) + + for i in outputs: + rewrite_tensor_batch_size(i, batch_size) + + ## we may need to modify reshape initializer if we modify input batchsize + ## this code only works when the target shape is fixed, and occurs as a input initializer in the node + ## so this may introduce some other problems when the purpose of reshape operations are totally different + + if modify_reshape_dim: + reshape_input = [] + for idx, i in enumerate(nodes): + if i.op_type == "Reshape": + reshape_input.extend(i.input) + if i.op_type == "Resize" and len(i.input) == 4: + reshape_input.append(i.input[3]) + for idx, i in enumerate(initializer): + if i.name in reshape_input: + shape = onnx.numpy_helper.to_array(i).copy() + if shape.dtype == "int64": + shape[0] = batch_size + initializer[idx].CopyFrom( + onnx.numpy_helper.from_array(shape, i.name)) + + for i in graph.value_info: + if i.type.tensor_type.shape.dim: + if i.type.tensor_type.shape.dim[0].dim_value == ori_batch_size: + i.type.tensor_type.shape.dim[0].dim_value = batch_size + + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + + model = onnx.shape_inference.infer_shapes(model, + check_type=True, + strict_mode=True, + data_prop=True) + onnx.checker.check_model(model) + + if save_model_path: + onnx.save(model, save_model_path) + return model + diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py 
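A hypothetical call: rewrite a detector ONNX model from batch 1 to batch 32 and save the result (file names are placeholders). As the module docstring notes, every input and output batch dimension is rewritten together, so sequence models whose leading dimension is not a batch dimension need extra care.

```python
rewrite_batch_size("yolov5s_b1.onnx",
                   batch_size=32,
                   save_model_path="yolov5s_b32.onnx")
```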
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py new file mode 100644 index 0000000000000000000000000000000000000000..96823647216acb23b8f1a6be39d00aacf53107ec --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/onnx_util.py @@ -0,0 +1,130 @@ +import onnx +from collections import defaultdict + +import onnx +import os + +## FYI +ONNX_DTYPE = { + 0: onnx.TensorProto.FLOAT, + 1: onnx.TensorProto.FLOAT, + 2: onnx.TensorProto.UINT8, + 3: onnx.TensorProto.INT8, + 4: onnx.TensorProto.UINT16, + 5: onnx.TensorProto.INT16, + 6: onnx.TensorProto.INT32, + 7: onnx.TensorProto.INT64, + 8: onnx.TensorProto.STRING, + 9: onnx.TensorProto.BOOL, +} + + +def rewrite_tensor_dim(tensor, dim_value_dict): + if isinstance(dim_value_dict, list): + dim_value_dict = {idx: i for idx, i in enumerate(dim_value_dict)} + all_dim = tensor.type.tensor_type.shape.dim + for idx, value in dim_value_dict.items(): + if isinstance(value, str): + all_dim[idx].dim_param = "batch" + else: + all_dim[idx].dim_value = value + + +def rewrite_tensor_batch_size(tensor, batch_size): + + dim_value_dict = {0: batch_size} + rewrite_tensor_dim(tensor, dim_value_dict) + + +def get_tensor_dim(tensor): + dims = [] + all_dim = tensor.type.tensor_type.shape.dim + rank = len(all_dim) + for i in range(rank): + if all_dim[i].dim_value: + dims.append(all_dim[i].dim_value) + else: + dims.append(all_dim[i].dim_param) + return dims + + +def get_tensor_name(tensor): + return tensor.name + + +def nchw_dim_to_nhwc_dim(dim_list): + assert len(dim_list) == 4 + new_dim = [dim_list[0], dim_list[2], dim_list[3], dim_list[1]] + return new_dim + + +def get_input_number(model): + if isinstance(model, str): + model = onnx.load(model) + inputs = model.graph.input + return len(inputs) + +def get_batch_size(model): + if isinstance(model, str): + model = onnx.load(model) + inputs = model.graph.input + return get_tensor_dim(inputs[0])[0] + + +def count_op_type(model): + if isinstance(model, str): + model = onnx.load(model) + + nodes = model.graph.node + + node2count = defaultdict(int) + for i in nodes: + node2count[i.op_type] += 1 + + return node2count + + +def contain_qlinear_opearator(onnx_model): + if isinstance(onnx_model, str): + onnx_model = onnx.load(onnx_model) + + nodes = onnx_model.graph.node + + for i in nodes: + op_type = i.op_type.lower() + if op_type.startswith("qlinear") or op_type.startswith("qgemm"): + return True + return False + + +def get_all_node_name(model, exclude_constant=False, pretty_print=False): + if isinstance(model, str): + model = onnx.load(model) + + nodes = model.graph.node + if exclude_constant: + all_node = [i.name for i in nodes if i.op_type != "Constant"] + else: + all_node = [i.name for i in nodes] + + all_node.sort() + if pretty_print: + res = [f'"{i}"' for i in all_node] + res = ",\n".join(res) + res = f'[\n{res}\n]' + print(res) + + return all_node + +def rewrite_int64_input_to_int32(model): + inputs = model.graph.input + + for i in inputs: + if i.type.tensor_type.elem_type == 7: + i.type.tensor_type.elem_type = 6 + + print(inputs) + import pdb;pdb.set_trace() + onnx.checker.check_model(model) + + return model \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..2490c6643cae10a61790262dde71a25273bd4be9 --- /dev/null +++ 
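A quick inspection sketch using the helpers above; `model.onnx` is a placeholder path, and each helper also accepts an already-loaded `onnx.ModelProto`.

```python
import onnx

model = onnx.load("model.onnx")
print(get_batch_size(model))             # first dim of the first graph input
print(count_op_type(model))              # e.g. {"Conv": 53, "Relu": 49, ...}
print(contain_qlinear_opearator(model))  # True if QLinear*/QGemm nodes are present
```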
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/quantization.py @@ -0,0 +1,531 @@ +import os +import psutil +from itertools import permutations +import numpy as np + +import tvm +from tvm import relay + +import onnx +import onnx.helper as onnx_helper +import onnxoptimizer +from onnxsim import simplify +from onnxruntime.quantization import (CalibrationDataReader, QuantFormat, + quantize_static, QuantType, + CalibrationMethod) + +from .onnx_util import contain_qlinear_opearator, rewrite_tensor_dim +from .onnx_rewrite_batch_size import rewrite_batch_size +from .dataloader import get_dataloader_from_args + +class Node: + def __init__(self, name, op_type, input, output): + self.name = name + self.op_type = op_type + self.input = input + self.output = output + + + self.from_node = [] + self.to_node = [] + + def __repr__(self) -> str: + return f"{self.name} [{self.op_type}], input = {self.input}, output = {self.output}" + + + @staticmethod + def connect(node_list): + perm = permutations(node_list, 2) + for (i, j) in perm: + i._connect(j) + + def _connect(self, node): + if node in self.from_node or node in self.to_node: + return + for output in node.output: + if output in set(self.input): + node.to_node.append(self) + self.from_node.append(node) + +class Model: + @staticmethod + def add_ms_opset_domain(model, + ms_opset_domain="com.microsoft", + ms_opset_version=1): + found = False + for i in model.opset_import: + if i.domain == ms_opset_domain: + found = True + break + + if not found: + ms_opset = onnx_helper.make_operatorsetid(ms_opset_domain, + ms_opset_version) + model.opset_import.append(ms_opset) + + return model + + @staticmethod + def preprocess_onnx(model): + model = Model.add_ms_opset_domain(model) + + passes = onnxoptimizer.get_available_passes() + + no_need = [ + # NOTE(chen.chen): the following passes cause some error, need to debug + "lift_lexical_references", + "split_init", + "split_predict", + + # we do not want to rename anything + "rename_input_output", + "set_unique_name_for_nodes" + ] + passes = [i for i in passes if i not in no_need] + model = onnxoptimizer.optimize(model, passes) + + model, check = simplify(model) + assert check, "Simplified ONNX model could not be validated" + + # model = onnx.shape_inference.infer_shapes(model, check_type=True, strict_mode=True, data_prop=True) + return model + + def __init__(self, model): + if isinstance(model, str): + model = onnx.load(model) + self.model = Model.preprocess_onnx(model) + + self.graph = self.model.graph + self.nodes = self.graph.node + self.node_list = [] + for i in self.nodes: + self.node_list.append(Node(i.name, i.op_type, i.input, i.output)) + Node.connect(self.node_list) + + + +def find_detect_node(model): + if isinstance(model, str): + model = Model(model) + assert isinstance(model, Model) + + node_list = model.node_list + + + last_conv = [] + # find last conv nodes before detect + for i in range(len(node_list) - 1, -1, -1): + node = node_list[i] + if not node.op_type == "Conv": + continue + + after_node = node.to_node[:] + find_conv = False + while after_node: + last = after_node.pop() + after_node.extend(last.to_node) + + if last.op_type == "Conv": + find_conv = True + break + + if not find_conv: + last_conv.append(node) + + + + exclude_detect_node_type = [ + "Add", "Mul", "Concat", + # "Reshape", "Exp", "Power", "Slice", "Split" ## these node will not be quantized actually + ] + exclude_detect_node_name = [] + for i in last_conv: + after_node = i.to_node[:] + while after_node: + last = 
after_node.pop() + after_node.extend(last.to_node) + + if last.op_type in exclude_detect_node_type: + exclude_detect_node_name.append(last.name) + + exclude_detect_node_name = sorted(list(set(exclude_detect_node_name))) + return exclude_detect_node_name + + +def find_unsupported_node(model): + if isinstance(model, str): + model = Model(model) + assert isinstance(model, Model) + + node_list = model.node_list + + + igie_not_supported_node_type = [ + "Softmax", + "Gemm", # igie onnx frontend error for mobilenetv2 + ] + exclude_node_name = [] + for i in node_list: + if i.op_type in igie_not_supported_node_type: + exclude_node_name.append(i.name) + + return exclude_node_name + + +def find_group_conv_node(model): + if isinstance(model, str): + model = Model(model) + assert isinstance(model, Model) + + nodes = model.graph.node + + exclude_node_name = [] + for node in nodes: + if node.op_type == "Conv": + attrs = node.attribute + for j in attrs: + if j.name == "group" and j.i != 1: + exclude_node_name.append(node.name) + + return exclude_node_name + +class BaseDataReader(CalibrationDataReader): + + def __init__(self, dataloader, cnt_limit=500): + # pytorch-like dataloader + self.dataloader = dataloader + self.cnt = 0 + self.cnt_limit = cnt_limit + self.rewind() + + def get_next(self): + raise NotImplementedError + + def reset_dataloader(self): + self.dataloader_iter = iter(self.dataloader) + + def rewind(self): + self.reset_dataloader() + self.cnt = 0 + + def set_dataloader(self, dataloader): + self.dataloader = dataloader + self.rewind() + + def should_stop(self, memory_upper_bound=80): + # avoid oom + if BaseDataReader._exceed_memory_upper_bound( + upper_bound=memory_upper_bound + ) or self.cnt + 1 > self.cnt_limit: + return True + self.cnt += 1 + return False + + def get_next_data(self): + data = next(self.dataloader_iter, None) + if data is None: + self.reset_dataloader() + data = next(self.dataloader_iter, None) + return data + + @staticmethod + def _exceed_memory_upper_bound(upper_bound=90): + # upper_bound in [0, 100] + + info = psutil.virtual_memory() + total_percent = info.percent + if total_percent >= upper_bound: + return True + return False + +class ONNXDataReader(BaseDataReader): + def __init__(self, input_name_list, dataloader, cnt_limit=500): + self.input_name_list = input_name_list + super().__init__(dataloader, cnt_limit) + + def get_next(self): + if self.should_stop(memory_upper_bound=90): + return None + print(f"onnx calibration data count: {self.cnt}") + all_input = self.get_next_data() + + #NOTE(chen.chen) + # we assumen the all_input contains each input tensorin input_name_list with the same order + assert len(all_input) >= len(self.input_name_list) + ort_input = {k: np.array(v) for k, v in zip(self.input_name_list, all_input)} + return ort_input + + +def fill_onnx_input_shape(model_path, input_shape_list, model_save_path=None): + model = onnx.load(model_path) + inputs = model.graph.input + + assert len(inputs) == len(input_shape_list), f"input number error, should be {len(inputs)}, got {len(input_shape_list)}" + for tensor, shape in zip(inputs, input_shape_list): + rewrite_tensor_dim(tensor, shape) + + model = Model.preprocess_onnx(model) + + if model_save_path is None: + model_save_path = f"{model_path[:-5]}_fill_input.onnx" + onnx.save(model, model_save_path) + + return model_save_path + + +def onnx_quantize_model_from_args(args): + ori_model_path = args.model_path + assert ori_model_path.endswith(".onnx") + + # NOTE(chen.chen) + # we should just rewrite input_shape here 
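An illustrative sketch of how the three `find_*` helpers above can be combined into a quantization exclude list for a YOLO-style model (`yolov5s.onnx` is a placeholder path); this mirrors what `onnx_quantize_model_from_args` does below when the corresponding options are enabled.

```python
nodes_to_exclude = []
nodes_to_exclude += find_detect_node("yolov5s.onnx")        # tail nodes of the detect head
nodes_to_exclude += find_unsupported_node("yolov5s.onnx")   # ops IGIE cannot quantize
nodes_to_exclude += find_group_conv_node("yolov5s.onnx")    # grouped convolutions
nodes_to_exclude = sorted(set(nodes_to_exclude))
```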
since some batch_size dim of reshape op is fixed + # ori_model_path = fill_onnx_input_shape(ori_model_path, args.input_shape_list) + + # skip model which has been quantized + if contain_qlinear_opearator(ori_model_path): + return ori_model_path + + # check if quantization_config is valid + # NOTE(chen.chen) + # if user has not specified the quantization_config + # we should have a default config here + + config = args.quantization_config.get("onnx", {}) + quant_format = config.get("quant_format", "qoperator").lower() + if quant_format == "qdq": + quant_format = QuantFormat.QDQ + elif quant_format == "qoperator": + quant_format = QuantFormat.QOperator + else: + raise ValueError(f"invalid quant_format: {quant_format}") + + + + op_types_to_quantize = config.get("op_types_to_quantize", []) + per_channel = config.get("per_channel", False) + reduce_range = config.get("reduce_range", False) + nodes_to_quantize = config.get("nodes_to_quantize", []) + nodes_to_exclude = config.get("nodes_to_exclude", []) + skip_group_conv_layer = config.get("skip_group_conv_layer", False) + + if args.automatic_yolo_quantization: + yolo_detect_nodes = find_detect_node(ori_model_path) + nodes_to_exclude.extend([i for i in yolo_detect_nodes if i not in nodes_to_exclude]) + + if skip_group_conv_layer: + group_conv_node = find_group_conv_node(ori_model_path) + print(group_conv_node) + nodes_to_exclude.extend([i for i in group_conv_node if i not in nodes_to_exclude]) + + unsupport_node = find_unsupported_node(ori_model_path) + nodes_to_exclude.extend([i for i in unsupport_node if i not in nodes_to_exclude]) + + calibrate_method = config.get("calibrate_method", "percentile").lower() + if calibrate_method == "minmax": + calibrate_method=CalibrationMethod.MinMax + elif calibrate_method == "entropy": + calibrate_method=CalibrationMethod.Entropy + elif calibrate_method == "percentile": + calibrate_method=CalibrationMethod.Percentile + else: + raise ValueError(f"invalid calibrate_method: {calibrate_method}") + + quant_model_path = f"{os.path.split(ori_model_path)[1][:-5]}_quant.onnx" + + + ## NOTE(chen.chen) + ## for memory issue, we will try to change the batchsize of model to 1 during quantization + ## but it only works for simple cv model + ## we reserve a field for user to control this behavior to avoid some strange batch-rewriting result + memory_efficient_quant = config.get("memory_efficient_quant", True) + batch_size = args.batch_size + if memory_efficient_quant: + model_input = ori_model_path[:-5] + "_b1.onnx" + rewrite_batch_size(ori_model_path, + batch_size=1, + save_model_path=model_input) + args.batch_size = 1 + else: + model_input = ori_model_path + + dataloader = get_dataloader_from_args(args) + + calibrate_data_count = config.get("calibrate_data_count", 20) + datareader = ONNXDataReader(args.input_name_list, dataloader, calibrate_data_count) + + args.batch_size = batch_size + + if args.verbose: + print("onnx quanziation config:") + print("model_input: ", model_input) + print("model_output: ", quant_model_path) + print("quant_format: ", quant_format) + print("op_types_to_quantize: ", op_types_to_quantize) + print("per_channel: ", per_channel) + print("reduce_range: ", reduce_range) + print("nodes_to_quantize: ", nodes_to_quantize) + print("nodes_to_exclude: ", nodes_to_exclude) + print("calibrate_method: ", calibrate_method) + print("skip_group_conv_layer: ", skip_group_conv_layer) + + symmetric_quantize( + model_input=model_input, + model_output=quant_model_path, + calibration_data_reader=datareader, + 
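For reference, a sketch of the `onnx` section of `quantization_config` consumed by `onnx_quantize_model_from_args`; every key shown is read by the function, and the values are illustrative rather than recommended settings.

```python
quantization_config = {
    "onnx": {
        "quant_format": "qoperator",       # or "qdq"
        "calibrate_method": "percentile",  # "minmax" / "entropy" / "percentile"
        "per_channel": False,
        "reduce_range": False,
        "op_types_to_quantize": [],
        "nodes_to_quantize": [],
        "nodes_to_exclude": [],
        "skip_group_conv_layer": False,
        "memory_efficient_quant": True,    # quantize at batch 1, rewrite back afterwards
        "calibrate_data_count": 20,
    }
}
```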
quant_format=quant_format, + op_types_to_quantize=op_types_to_quantize, + per_channel=per_channel, + reduce_range=reduce_range, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + calibrate_method=calibrate_method) + + ## NOTE(chen.chen) + ## rewrite the batchsize back to the origin batchsize + if memory_efficient_quant: + rewrite_batch_size(quant_model_path, + batch_size=args.batch_size, + save_model_path=quant_model_path) + + return quant_model_path + + + + +def igie_calibrate_dataset(dataloader, input_name_list, calibrate_data_count=3): + calibration_data_list = [] + for idx, batch in enumerate(dataloader): + if idx >= calibrate_data_count: + break + + data_dict = {} + for data, name in zip(batch, input_name_list): + data_dict[name] = data + + calibration_data_list.append(data_dict) + return calibration_data_list + +def igie_quantize_model_from_args(mod, params, args): + + # NOTE(chen.chen) + # we need to remove unused function for tensorflow + from tvm.relay.transform.iluvatar import SimplifyGraph + mod = SimplifyGraph(mod, params) + + + config = args.quantization_config.get("igie", {}) + + + base_name = os.path.splitext(os.path.split(args.model_path)[1])[0] + + scale_file_path = config.get("scale_file_path", "") + if scale_file_path == "": + scale_file_path = f"quantize_scale_file_{base_name}_{args.target}.npy" + calibrate_mode = config.get("calibrate_mode", "percentile") + weight_scale = config.get("weight_scale", "max") + + + skip_first_conv_layer = config.get("skip_first_conv_layer", False) + if args.target != "iluvatar_with_all_libs": + skip_first_conv_layer = True + + skip_conv_layers = None + if skip_first_conv_layer: + skip_conv_layers = [0] + + skip_dense_layer = config.get("skip_dense_layer", False) + calibrate_chunk_by = config.get("calibrate_chunk_by", -1) + skip_group_conv_layer = config.get("skip_group_conv_layer", False) + + global_scale = config.get("global_scale", 0.8) + calibrate_data_count = config.get("calibrate_data_count", 3) + + if args.verbose: + print("igie quanziation config:") + print("calibrate_mode: ", calibrate_mode) + print("weight_scale: ", weight_scale) + print("scale_file_path: ", scale_file_path) + print("skip_dense_layer: ", skip_dense_layer) + print("skip_first_conv_layer: ", skip_first_conv_layer) + print("skip_group_conv_layer: ", skip_group_conv_layer) + print("calibrate_chunk_by: ", calibrate_chunk_by) + print("global_scale: ", global_scale) + print("calibrate_data_count: ", calibrate_data_count) + + + if calibrate_mode == "global_scale": + with tvm.transform.PassContext(opt_level=3): + with relay.quantize.qconfig(calibrate_mode=calibrate_mode, + global_scale=global_scale, + skip_conv_layers=skip_conv_layers, + skip_dense_layer=skip_dense_layer): + mod = relay.quantize.quantize(mod, params) + + elif calibrate_mode == "percentile" or calibrate_mode == "kl_divergence": + + dataloader = get_dataloader_from_args(args) + dataset = igie_calibrate_dataset(dataloader, args.input_name_list, calibrate_data_count) + + with tvm.transform.PassContext(opt_level=3): + with relay.quantize.qconfig(calibrate_mode=calibrate_mode, + weight_scale=weight_scale, + skip_conv_layers=skip_conv_layers, + skip_dense_layer=skip_dense_layer, + calibrate_chunk_by=calibrate_chunk_by, + import_scale_file=scale_file_path, + skip_group_conv_layers=skip_group_conv_layer): + mod = relay.quantize.quantize(mod, params, dataset=dataset) + + else: + raise ValueError(f"unsupported calibrate_mode: {calibrate_mode}") + + + + + return mod, params + + + + +def 
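Likewise, a sketch of the `igie` section of `quantization_config` read by `igie_quantize_model_from_args` above; the keys match the `config.get(...)` calls and the values are illustrative.

```python
quantization_config = {
    "igie": {
        "calibrate_mode": "percentile",   # "global_scale" / "percentile" / "kl_divergence"
        "weight_scale": "max",
        "scale_file_path": "",            # "" -> an auto-named quantize_scale_file_*.npy
        "skip_first_conv_layer": False,   # forced to True unless target is iluvatar_with_all_libs
        "skip_dense_layer": False,
        "skip_group_conv_layer": False,
        "calibrate_chunk_by": -1,
        "global_scale": 0.8,              # only used when calibrate_mode="global_scale"
        "calibrate_data_count": 3,
    }
}
```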
_modify_symmetric(extra_options): + if extra_options is None: + extra_options = {"ActivationSymmetric": True, "WeightSymmetric": True} + else: + extra_options["ActivationSymmetric"] = True + extra_options["WeightSymmetric"] = True + + return extra_options + + + +def symmetric_quantize( + model_input, + model_output, + calibration_data_reader: CalibrationDataReader, + quant_format=QuantFormat.QOperator, + op_types_to_quantize=None, + per_channel=False, + reduce_range=False, + nodes_to_quantize=None, + nodes_to_exclude=None, + optimize_model=False, + calibrate_method=CalibrationMethod.Percentile, + extra_options=None, +): + extra_options = _modify_symmetric(extra_options) + assert quant_format in [QuantFormat.QOperator, QuantFormat.QDQ] + quantize_static(model_input, + model_output, + calibration_data_reader=calibration_data_reader, + quant_format=quant_format, + op_types_to_quantize=op_types_to_quantize, + per_channel=per_channel, + reduce_range=reduce_range, + activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, + nodes_to_quantize=nodes_to_quantize, + nodes_to_exclude=nodes_to_exclude, + optimize_model=optimize_model, + use_external_data_format=False, + calibrate_method=calibrate_method, + extra_options=extra_options) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py new file mode 100644 index 0000000000000000000000000000000000000000..907288df18e949c0d7e7163b6c81f46ff9d0ce8f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/stauts_checker.py @@ -0,0 +1,21 @@ +def check_status(result_dict, args): + is_valid = True + if args.acc_target is not None: + acc_result = result_dict["acc_result"] + if acc_result < args.acc_target: + print(f"Expected acc_target is {args.acc_target}, got {acc_result}") + is_valid = False + + if args.fps_target is not None: + fps_result = result_dict["fps_result"] + if fps_result < args.fps_target: + print(f"Expected fps_target is {args.fps_target}, got {fps_result}") + is_valid = False + + if is_valid: + print("\n====Test Success!====\n") + else: + print("\n====Test failed!====\n") + exit(1) + + \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py new file mode 100644 index 0000000000000000000000000000000000000000..2df46829cd9c33650b4e9d6b4b0beb71d597866b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/target.py @@ -0,0 +1,24 @@ +import tvm + +def get_target(target_name): + + target = None + if target_name == "llvm": + target = tvm.target.Target(target_name) + + elif target_name == "iluvatar": + target = tvm.target.iluvatar(model="MR") + + elif target_name == "iluvatar_with_cudnn_cublas": + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas") + elif target_name == "iluvatar_with_ixinfer": + target = tvm.target.iluvatar(model="MR", options="-libs=ixinfer") + elif target_name == "iluvatar_with_all_libs": + target = tvm.target.iluvatar(model="MR", options="-libs=cudnn,cublas,ixinfer") + + else: + raise Exception(f"Unsupport Target name: {target_name}!") + + device = tvm.device(target.kind.name, 0) + + return target, device diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..ed0ad0f73d23587d10b0ceb6b248fdb0959c6014 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/utils/timer.py @@ -0,0 +1,81 @@ +import tvm +import time +from contextlib import contextmanager + + +_get_timer = tvm.get_global_func("profiling.get_timer") +_start = tvm.get_global_func("profiling.start") +_stop = tvm.get_global_func("profiling.stop") +_elapse_time = tvm.get_global_func("profiling.elapse_time") + + +class Timer: + def __init__(self, device=None): + self.last_duration = 0 # ms + self.duration_list = [] # ms + + self.device = device + self._timer = None + if device is not None: + self._timer = _get_timer(device) + + self.start_cnt = 0 + self.end_cnt = 0 + + def total_duration(self): + return sum(self.duration_list) + + def _update(self, duration): + self.last_duration = duration + self.duration_list.append(self.last_duration) + + + def start(self): + assert self._timer is not None + self.start_cnt += 1 + self.device.sync() + _start(self._timer) + + + def stop(self): + assert self._timer is not None + self.end_cnt += 1 + assert self.end_cnt == self.start_cnt + + _stop(self._timer) + self._update(_elapse_time(self._timer) / 1e6) ## ns / 1e6 -> ms + + + + # @contextmanager + # def timeit_sync(self, device, use_host_time=False): + # # NOTE(chen.chen) + # # not works as expected when use device timer + # # it seems python contextmanager always use host time? + # if use_host_time: + # device.sync() + # t1 = time.time() + + # yield + + # device.sync() + # t2 = time.time() + # self._update((t2 - t1) * 1e3) ## s * 1e3 -> ms + # else: + # timer = _get_timer(device) + # device.sync() + # _start(timer) + + # yield + + # _stop(timer) + # self._update(_elapse_time(timer) / 1e6) ## ns / 1e6 -> ms + + # @contextmanager + # def timeit(self): + # t1 = time.time() + + # yield + + # t2 = time.time() + # self._update((t2 - t1) * 1e3) ## s * 1e3 -> ms diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/compile_backend.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/compile_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..535a777c1f993a64de4a173ee6efe498626178fc --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/compile_backend.py @@ -0,0 +1,84 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional + + +class CompileBackend(object): + def __init__(self): + self.hardware_type = 'UnKnown' + self.need_reload = False + self.need_quant = False + + def version(self) -> str: + """ + Return compile backend version details + """ + raise NotImplementedError("CompileBackend:version") + + def pre_optimize(self, configs: Dict[str, Any]): + """ + Model pre-optimization interface. Requirements: Model pre-optimization + cannot change the model format. 
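A hedged sketch of timing a compiled module with the `Timer` above; `module` and `device` are assumed to come from an earlier IGIE build step, and the timer reports milliseconds measured through the device-side profiling hooks.

```python
timer = Timer(device)
for _ in range(100):
    timer.start()
    module.run()          # placeholder for whatever work is being measured
    timer.stop()

print(timer.last_duration, "ms (last iteration)")
print(timer.total_duration() / 100, "ms (average)")
```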
Torch model export to ONNX is allowed. + """ + return configs + + def compile(self, + configs: Dict[str, Any], + dataloader=None) -> Dict[str, Any]: + """ + Model compilation interface. Model conversion and compilation + can be performed here. The model format can be changed here. + + Arguments: + configs (list of ``str``s, optional): model configs. + """ + raise NotImplementedError("CompileBackend:compile") + + def tuning(self, configs: Dict[str, Any]): + """ + This interface is reserved for the future. The purpose is + that some compilation optimization needs to be improved + according to the results of the first compilation and operation. + The tuning interface provides such a window for tuning. + """ + return + + def segment(self, configs: Dict[str, Any]): + """ + This interface is reserved for the future. The purpose is + to better adapt to the scene of subgraph compilation in the future. + For manufacturers who place segment and compile in the same stage, + this interface can be ignored. + """ + return + + def get_interact_profile(self, config: Dict[str, Any]): + """ + Load the interactive configuration interface. If the vendor needs + the user to provide some additional information, you can load the + json file you added here and return a list of dict. mlperf will + display the content of the profile to the user and is responsible + for collecting feedback about the profile. If the user does not need + to provide additional information, return None here. get_interact_profile + can already get some workload info and model info, and the vendor can + also generate some options other than json under this API. + """ + raise NotImplementedError("CompileBackend:get_interact_profile") + + def get_best_batch_size(self) -> Optional[List[int]]: + """ + Get Best Batch Size for the model + """ + raise NotImplementedError("CompileBackend:get_best_batch_size") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/runtime_backend.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/runtime_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..db856b0bd897539bf6d40845b7dcf7b325f158a6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/backends/runtime_backend.py @@ -0,0 +1,65 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
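A minimal, hypothetical vendor backend satisfying the `CompileBackend` hooks; the class name, precision, and returned report keys are illustrative (the keys shown are the ones `perf_engine.py` reads back), not a prescribed implementation.

```python
from typing import Any, Dict


class CompileBackendDEMO(CompileBackend):
    def __init__(self):
        super().__init__()
        self.hardware_type = "DEMO"

    def version(self) -> str:
        return "0.0.1"

    def compile(self, configs: Dict[str, Any], dataloader=None) -> Dict[str, Any]:
        # model conversion / engine building would happen here
        return {
            "model": configs["workload"]["model"],
            "compile_status": "success",
            "compile_precision": "FP16",
            "sg_percent": 100,
        }

    def get_interact_profile(self, config: Dict[str, Any]):
        return None   # no extra questions for the user

    def get_best_batch_size(self):
        return None   # keep the batch sizes declared in the workload
```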
+ +from typing import Any, Dict + + +class RuntimeBackend(object): + def __init__(self): + self.hardware_type = 'UnKnown' + self.need_reload = False + self.need_quant = False + + def version(self) -> str: + """ + Return runtime backend version details + """ + raise NotImplementedError("RuntimeBackend:version") + + def load(self, batch_size) -> str: + """ + Return runtime backend version details + """ + raise NotImplementedError("RuntimeBackend:load") + + def get_loaded_batch_size(self) -> int: + """ + Get Currect batch size + """ + raise NotImplementedError("RuntimeBackend:get_loaded_batch_size") + + def predict(self, data): + """ + Run the compiled model and return the model output corresponding to the data. + """ + raise NotImplementedError("RuntimeBackend:predict") + + def is_qs_mode_supported(self) -> bool: + """ + Used to check whether QSv2 Runtime is enabled + """ + return False + + def generate_qs_config(self) -> Dict[str, Any]: + """ + Used only when is_qs_ported return True. Generate QS Config + File for QSv2 Runtime + """ + return None + + def benchmark(self, dataloader): + """ + Performance Testing when qs mode is not enabled. + """ + raise NotImplementedError("RuntimeBackend:benchmark") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/__init__.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/backend_store.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/backend_store.py new file mode 100644 index 0000000000000000000000000000000000000000..80b1e8fbfaacd479f968d6dfa2e710d82a9f2534 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/backend_store.py @@ -0,0 +1,62 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import logging +from general_perf.backends.compile_backend import CompileBackend +from general_perf.backends.runtime_backend import RuntimeBackend + +log = logging.getLogger("BackendStore") + +__all__ = [ + "CompileBackend", +] + + +def init_compile_backend(hardware_type: str) -> CompileBackend: + """ + Load related compile backend with input hardware type + + Arguments: str + + Returns: CompileBackend() + """ + log.info("Loading Compile Backend: {}".format(hardware_type)) + + compile_backend = importlib.import_module('general_perf.backends.' 
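A matching hypothetical runtime backend; the bodies are placeholders that show the call order `perf_engine.py` drives (`load` -> `predict`/`benchmark`), and the benchmark report keys are made up.

```python
from typing import Any, Dict


class RuntimeBackendDEMO(RuntimeBackend):
    def __init__(self):
        super().__init__()
        self.hardware_type = "DEMO"
        self.batch_size = None

    def version(self) -> str:
        return "0.0.1"

    def load(self, batch_size):
        self.batch_size = batch_size      # deserialize the compiled engine here

    def get_loaded_batch_size(self) -> int:
        return self.batch_size

    def predict(self, data):
        raise NotImplementedError("run the compiled model on `data` here")

    def benchmark(self, dataloader) -> Dict[str, Any]:
        # whatever dict is returned ends up in the "Performance" section of the report
        return {"BS": self.batch_size, "QPS": 0, "AVG_Latency": 0}
```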
+ + hardware_type + + ".compile_backend_" + + hardware_type.lower()) + compile_backend = getattr(compile_backend, + "CompileBackend" + hardware_type) + return compile_backend() + + +def init_runtime_backend(hardware_type: str) -> RuntimeBackend: + """ + Load related compile backend with input hardware type + + Arguments: str + + Returns: RuntimeBackend() + """ + log.info("Loading Runtime Backend: {}".format(hardware_type)) + + runtime_backend = importlib.import_module('general_perf.backends.' + + hardware_type + + ".runtime_backend_" + + hardware_type.lower()) + runtime_backend = getattr(runtime_backend, + "RuntimeBackend" + hardware_type) + return runtime_backend() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/dataset_store.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/dataset_store.py new file mode 100644 index 0000000000000000000000000000000000000000..5309ce5a4f68f5b3a12d7bdcc611cd526f9bbaef --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/dataset_store.py @@ -0,0 +1,43 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import logging +from typing import Any, Dict +import os +import sys +from general_perf.datasets.data_loader import Dataset + +log = logging.getLogger("DatasetStore") + + +def load_dataset(config: Dict[str, Any]) -> Dataset: + """ + Load related dataset class with config file + Args: Dict + + Returns: Dataloader() + """ + if config['dataset_name']: + dataset_name = config['dataset_name'] + log.info("Loading Dataset: {}".format(config['dataset_name'])) + else: + dataset_name = 'fake_dataset' + log.info("Loading Dataset: Dataset does not exist, using fake data") + + data_loader = importlib.import_module('general_perf.datasets.' + + dataset_name + ".data_loader") + data_loader = getattr(data_loader, 'DataLoader') + dataset = data_loader(config) + return dataset diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/workload_store.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/workload_store.py new file mode 100644 index 0000000000000000000000000000000000000000..ed95088abad6da4a75263f479e03201d94360d42 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/configs/workload_store.py @@ -0,0 +1,46 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
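The two loaders above resolve backends purely by naming convention: for a hardware type `ILUVATAR` they import `general_perf.backends.ILUVATAR.compile_backend_iluvatar.CompileBackendILUVATAR` and `general_perf.backends.ILUVATAR.runtime_backend_iluvatar.RuntimeBackendILUVATAR`. A usage sketch:

```python
compile_backend = init_compile_backend("ILUVATAR")
runtime_backend = init_runtime_backend("ILUVATAR")
print(compile_backend.version(), runtime_backend.version())
```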
+ +import json +import os +import logging +from typing import Any, List, Dict + +log = logging.getLogger("WorkloadStore") + + +def load_workload(task: str) -> Dict[str, Any]: + """ + Return a list of dictionary with model Configuration + + Args: List[str] + + Returns: List[dic] + """ + modules_dir = os.path.dirname(os.path.dirname( + os.path.dirname(__file__))) + '/workloads' + + for file in os.listdir(modules_dir): + path = os.path.join(modules_dir, file) + if (not file.startswith('_') and not file.startswith('.') + and (file.endswith('.json') or os.path.isdir(path)) + and file[:file.find('.json')] == task): + module_name = file + with open("general_perf/workloads/" + module_name, 'r') as f: + workload_dict = json.load(f) + return workload_dict + else: + log.error( + "Task name: [ {} ] was not found, please check your task name". + format(task)) \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..2b7a4df767042ab408b6cc1194ffdec1bdac976f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py @@ -0,0 +1,395 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
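For context, an illustrative shape of a workload file (`general_perf/workloads/<task>.json`) as returned by `load_workload`; the keys listed are the ones `perf_engine.py` reads, and the values are made up.

```python
example_workload = {
    "model": "resnet50-torch-fp32",   # model identifier used to look up the model config
    "test_perf": True,
    "test_accuracy": True,
    "test_numeric": True,
    "data_percent": 50,               # percentage of the dataset used for accuracy checks
    "batch_sizes": [1, 8, 32],
    "compile_only": False,
}
```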
+import sys +import os +import logging +import importlib +import json +import subprocess +import time + +from typing import Any, Dict, Tuple +import virtualenv +from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog +from prompt_toolkit.styles import Style + +BYTE_MLPERF_ROOT = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +os.chdir(BYTE_MLPERF_ROOT) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +import argparse +from general_perf.core.configs.workload_store import load_workload +from general_perf.core.configs.dataset_store import load_dataset +from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend +from general_perf.tools.build_pdf import build_pdf + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("PerfEngine") +os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3' + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="resnet50-tf-fp32", + help="The task going to be evaluted, refs to workloads/") + parser.add_argument( + "--hardware_type", + default="GPU", + help="The backend going to be evaluted, refs to backends/") + parser.add_argument("--compile_only", + action='store_true', + help="Run compilation only") + + args = parser.parse_args() + return args + + +class PerfEngine: + def __init__(self) -> None: + super().__init__() + self.args = get_args() + self.workload = load_workload(self.args.task) + self.backend_type = self.args.hardware_type + self.compile_backend = None + self.old_os_path = os.environ['PATH'] + self.prev_sys_path = list(sys.path) + self.real_prefix = sys.prefix + self.compile_only_mode = False + + def start_engine(self) -> None: + ''' + Byte MlPerf will create an virtual env for each backend to avoid dependance conflict + ''' + success, total = 0, len(self.workload) + if total == 0: + return + log.info("******************* Backend Env Initization *******************") + status = self.activate_venv(self.backend_type) + if not status: + log.warning("Activate virtualenv Failed, Please Check...") + + self.compile_backend = init_compile_backend(self.backend_type) + self.runtime_backend = init_runtime_backend(self.backend_type) + + output_dir = os.path.abspath('general_perf/reports/' + + self.backend_type) + os.makedirs(output_dir, exist_ok=True) + + status = self.single_workload_perf(self.workload) + + def single_workload_perf( + self, workload: Dict[str, Any]) -> bool: + log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model'])) + + # Check Compile Only Mode + self.compile_only_mode = False + if self.args.compile_only or workload['compile_only']: + self.compile_only_mode = True + + base_report = { + "Model": workload['model'].upper(), + "Backend": self.backend_type, + "Host Info": self.get_cpu_name() + } + + # Initalize Model Config Info + model_info = self.get_model_info(workload['model']) + pre_compile_config = {"workload": workload, 'model_info': model_info} + interact_info = self.check_interact_info(pre_compile_config) + pre_compile_config['interact_info'] = interact_info + if not model_info['dataset_name']: + model_info['dataset_name'] = 'fake_dataset' + + + ''' + Compile Backend could do some optimization like convert model format here + ''' + log.info("******************************************* Running Backend Compilation... 
*******************************************") + log.info("Running Backend Preoptimization...") + pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config) + + + # Initalize dataset + dataset = load_dataset(model_info) + dataset.preprocess() + base_report['Dataset'] = model_info['dataset_name'].upper( + ) if model_info['dataset_name'] else None + + #Placeholder Only + segment_info = self.compile_backend.segment(pre_compile_config) + + best_batch_sizes = self.compile_backend.get_best_batch_size() + if isinstance(best_batch_sizes, list): + pre_compile_config['workload'][ + 'batch_sizes'] = best_batch_sizes + + log.info("Start to compile the model...") + start = time.time() + compile_info = self.compile_backend.compile(pre_compile_config, + dataset) + end = time.time() + + graph_compile_report = {} + graph_compile_report["Compile Duration"] = round(end - start, 5) + graph_compile_report["Compile Precision"] = compile_info[ + 'compile_precision'] + graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent'] + if 'optimizations' in compile_info: + graph_compile_report['Optimizations'] = compile_info['optimizations'] + if 'instance_count' in compile_info: + base_report['Instance Count'] = compile_info['instance_count'] + if 'device_count' in compile_info: + base_report['Device Count'] = compile_info['device_count'] + base_report['Graph Compile'] = graph_compile_report + + # Initalize Output Dir and Reports + output_dir = os.path.abspath('general_perf/reports/' + + self.backend_type + '/' + + workload['model']) + os.makedirs(output_dir, exist_ok=True) + + # Compile only mode will stop here + if self.compile_only_mode: + base_report.pop("Backend") + return compile_info["compile_status"], base_report + + # load runtime backend + """ + Start Here + """ + batch_sizes = pre_compile_config['workload']['batch_sizes'] + self.runtime_backend.configs = compile_info + self.runtime_backend.workload = workload + self.runtime_backend.model_info = model_info + + self.runtime_backend.load(workload['batch_sizes'][0]) + # test accuracy + accuracy_report = {} + AccuracyChecker = self.get_accuracy_checker( + model_info['dataset_name'] + if model_info['dataset_name'] else 'fake_dataset') + AccuracyChecker.runtime_backend = self.runtime_backend + AccuracyChecker.dataloader = dataset + AccuracyChecker.output_dir = output_dir + AccuracyChecker.configs = compile_info + + if workload['test_accuracy']: + log.info("******************************************* Running Accuracy Checker... *******************************************") + + dataset.rebatch(self.runtime_backend.get_loaded_batch_size()) + accuracy_results = AccuracyChecker.calculate_acc( + workload['data_percent']) + + accuracy_report['Data Percent'] = workload['data_percent'] + accuracy_report.update(accuracy_results) + + # test numeric + if workload['test_numeric']: + log.info("******************************************* Running Numeric Checker... 
*******************************************") + + dataset.rebatch(self.runtime_backend.get_loaded_batch_size()) + if not workload['test_accuracy']: + accuracy_results = AccuracyChecker.calculate_acc( + workload['data_percent']) + diff_results = AccuracyChecker.calculate_diff() + accuracy_report.update(diff_results) + accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png" + + if accuracy_report: + base_report['Accuracy'] = accuracy_report + + # function to test qps and latency + if workload['test_perf']: + log.info("******************************************* Runing QPS Checker... *******************************************") + performance_reports = [] + qs_status = self.runtime_backend.is_qs_mode_supported() + if qs_status: + qs_config = self.runtime_backend.generate_qs_config() + performance_reports = self.qs_benchmark(qs_config) + else: + for bs in batch_sizes: + self.runtime_backend.load(bs) + batch_reports = self.runtime_backend.benchmark(dataset) + performance_reports.append(batch_reports) + base_report['Performance'] = performance_reports + + if "Instance Count" not in base_report: + log.warning("Vendors need to Add # of instances") + if "Device Count" not in base_report: + log.warning("Vendors need to Add # of devices") + + # write output to json file + output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json" + with open(output_report_path, 'w') as file: + json.dump(base_report, file, indent=4) + + base_report.pop("Backend") + log.info("Testing Finish. Report is saved in path: [ {}/{} ]". + format(output_dir[output_dir.rfind('general_perf'):], + os.path.basename(output_report_path))) + build_pdf(output_report_path) + log.info("PDF Version is saved in path: [ {}/{}-TO-{}.pdf ]".format( + output_dir[output_dir.rfind('general_perf'):], + base_report['Model'], + output_report_path.split('/')[-1].split('-')[1].upper())) + + return compile_info["compile_status"] + + #WIP + def qs_benchmark(self, qs_config: Dict[str, Any]) -> list: + return [] + + def get_accuracy_checker(self, dataset_name: str): + AccuracyChecker = importlib.import_module('general_perf.datasets.' 
+ + dataset_name + + ".test_accuracy") + AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker') + return AccuracyChecker() + + def get_model_info(self, model_name: str) -> Dict[str, Any]: + with open("general_perf/model_zoo/" + model_name + '.json', + 'r') as file: + model_info = json.load(file) + return model_info + + def get_cpu_name(self): + command = "lscpu | grep 'Model name' | awk -F: '{print $2}'" + cpu_name = subprocess.check_output(command, shell=True) + return cpu_name.decode().strip() + + def check_interact_info( + self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]: + interact_info = self.compile_backend.get_interact_profile( + pre_compile_config) + + answer = {} + if len(interact_info) == 0: + return answer + + dialog_style = Style.from_dict({ + 'dialog': 'bg:#88b8ff', + 'dialog frame.label': 'bg:#ffffff #000000', + 'dialog.body': 'bg:#000000 #a0acde', + 'dialog shadow': 'bg:#004aaa', + }) + + input_style = Style.from_dict({ + 'dialog': 'bg:#88b8ff', + 'dialog frame.label': 'bg:#ffffff #000000', + 'dialog.body': 'bg:#000000 #a0acde', + 'dialog shadow': 'bg:#004aaa', + 'text-area.prompt': 'bg:#ffffff', + 'text-area': '#000000', + }) + + option = yes_no_dialog(title=self.backend_type + '编译配置', + text='[请选择]:是否进行编译后端配置:', + style=dialog_style).run() + if option: + sum_question = len(interact_info) + for i, question in enumerate(interact_info): + if question['depends']: + state = 0 + for title in question['depends'].split(','): + if not answer[title]: + state = 1 + if state: + continue + if question['dialog_type'] == 'Yes/No Dialog': + option = yes_no_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + style=dialog_style).run() + elif question['dialog_type'] == "Input Dialog": + option = input_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + style=input_style).run() + elif question['dialog_type'] == "Radiolist Dialog": + choice = [(i, text) + for i, text in enumerate(question['options'])] + num = radiolist_dialog( + title=self.backend_type + '编译配置进度(' + str(i + 1) + + '/' + str(sum_question) + ')', + text="[Backend " + self.backend_type + "]: " + + question['note'], + values=choice, + style=dialog_style).run() + option = question['options'][num] if num is not None else question[ + 'default'] + answer[question['name']] = option + + return answer + + def activate_venv(self, hardware_type: str) -> bool: + if os.path.exists('general_perf/backends/' + hardware_type + + '/requirements.txt'): + log.info("Activating Virtual Env for " + hardware_type) + + venv_dir = os.path.join("general_perf/backends", + hardware_type + "/venv") + activate_file = os.path.join(venv_dir, 'bin', 'activate_this.py') + if not os.path.exists(venv_dir): + log.info("venv not exist, Creating Virtual Env for " + + hardware_type) + if (hardware_type == "HPU"): + virtualenv.create_environment(venv_dir,True) + else: + virtualenv.create_environment(venv_dir) + exec(open(activate_file).read(), {'__file__': activate_file}) + python_path = os.path.join(venv_dir, 'bin', 'python3') + subprocess.call([ + python_path, '-m', 'pip', 'install', '--upgrade', 'pip', '--quiet' + ]) + subprocess.call([ + python_path, '-m', 'pip', 'install', '-r', 'general_perf/backends/' + + hardware_type + '/requirements.txt', '-q' + ]) + else: + exec(open(activate_file).read(), {'__file__': activate_file}) 
+ ''' + just in case install failed in pre-run. + ''' + python_path = os.path.join(venv_dir, 'bin', 'python3') + subprocess.call([ + python_path, '-m', 'pip', 'install', '--upgrade', 'pip', '--quiet' + ]) + subprocess.call([ + python_path, '-m', 'pip', 'install', '-r', 'general_perf/backends/' + + hardware_type + '/requirements.txt', '-q' + ]) + + if not hasattr(sys, 'real_prefix'): + return False + return True + return True + + def deactivate_venv(self): + sys.path[: + 0] = self.prev_sys_path #will also revert the added site-packages + sys.prefix = self.real_prefix + os.environ['PATH'] = self.old_os_path + + +if __name__ == "__main__": + engine = PerfEngine() + engine.start_engine() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..c029187c538951aba1268b049618625313e42bd3 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/data_loader.py @@ -0,0 +1,91 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np + +log = logging.getLogger("Dataset") + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "FLOAT16": np.float16, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64, + "BOOL": np.bool +} + + +class Dataset(): + def __init__(self, config): + self.config = config + self.cur_bs = 1 + self.batched_data = [] + self.labels = [] + self.items = 0 + self.batch_num = int(self.items / self.cur_bs) + + def name(self) -> str: + """ + Return the name of dataset + """ + raise NotImplementedError("Dataset:name") + + def get_item_count(self) -> int: + """ + Return the number of data loaded + """ + return self.items + + def get_batch_count(self) -> int: + """ + Return the number of batched data + """ + return self.batch_num + + def preprocess(self): + """ + Data preprocess will happened here + """ + return + + def get_samples(self, sample_id): + """ + Query data with sample id + """ + if sample_id >= len(self.batched_data) or sample_id < 0: + raise ValueError("Your Input ID is out of range") + return self.batched_data[sample_id], self.labels[sample_id] + + def rebatch(self, new_bs, skip=True) -> None: + """ + Rebatch Datasets to specified number + """ + raise NotImplementedError("Dataset:rebatch") + + def get_fake_samples(self, batch_size, shape, input_type): + """ + Generate fake data for testing + """ + data = {} + if not input_type: + raise ValueError("Please provide input type") + i = 0 + for key, val in shape.items(): + val = [val[0] * batch_size] + val[1:] + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + i += 1 + return data diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/data_loader.py new file mode 100644 index 
0000000000000000000000000000000000000000..5aa109d6b500e72cd0a4cb00ffa1ec40efbb0644 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/data_loader.py @@ -0,0 +1,132 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +from general_perf.datasets import data_loader + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT16": np.float16, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64, + "BOOL": np.bool +} + +log = logging.getLogger("FAKE_DATA") + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + self.config = config + self.cur_bs = 1 + + def name(self): + return 'fake_dataset' + + def get_batch_count(self): + # always return 100 + return 100 + + def generate_fake_data(self): + input_shape = self.config["input_shape"] + input_type = self.config["input_type"].split(',') + + return self.get_fake_samples_regular(self.cur_bs, input_shape, + input_type) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + + def get_samples(self, sample_id): + if sample_id > 99 or sample_id < 0: + raise ValueError("Your Input ID is out of range") + + np.random.seed(sample_id) + return self.generate_fake_data() + + def get_fake_samples_regular(self, batch_size, shape, input_type): + data = {} + if not input_type: + raise ValueError("Please provide input type") + i = 0 + for key, val in shape.items(): + val = [batch_size] + val[1:] + if 'LONG' in input_type[i] or 'INT' in input_type[i]: + if "mask" in key or "segment" in key: + data[key] = np.random.randint( + low=0, high=2, + size=val).astype(INPUT_TYPE[input_type[i]]) + elif self.config[ + "model"] == "internal_videobert01-onnx-fp32" and key == "1_input_1": + data[key] = np.random.ones(size=val).astype( + INPUT_TYPE[input_type[i]]) + else: + data[key] = np.random.randint( + low=0, high=1000, + size=val).astype(INPUT_TYPE[input_type[i]]) + + elif 'STRING' in input_type[i]: + data[key] = 'This is a test string.' 
+ elif 'BOOL' in input_type[i]: + data[key] = np.zeros(shape=val, dtype=bool) + else: + sample_data = np.random.random(size=val) * 2 - 1 + data[key] = sample_data.astype(INPUT_TYPE[input_type[i]]) + i += 1 + + return data + + def get_fake_samples_bert(self, batch_size, shape, input_type): + data = {} + + avg_seq_len = 192 + max_seq_len = 384 + + if not input_type: + raise ValueError("Please provide input type") + i = 0 + for key, val in shape.items(): + val = [val[0] * batch_size] + val[1:] + if i == 0: + # fake input id and mask + input_ids = np.random.randint(low=0, high=30523, + size=val).astype( + INPUT_TYPE[input_type[i]]) + data[key] = input_ids + elif i == 1: + # fake input array length + input_len = np.random.randint(low=2 * avg_seq_len - + max_seq_len, + high=max_seq_len + 1, + size=(batch_size), + dtype=np.int32) + + input_mask = np.zeros(val).astype(INPUT_TYPE[input_type[i]]) + + for b_idx, s_len in enumerate(input_len): + input_mask[b_idx][:s_len] = 1 + data[key] = input_mask + else: + data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]]) + i += 1 + return data diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f8712a446782a0cd1e22c155cdc3610fa0c183 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/fake_dataset/test_accuracy.py @@ -0,0 +1,50 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
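The random-input generation above can be reproduced in isolation. Below is a minimal stand-alone sketch of what `get_fake_samples_regular` does, with made-up tensor names, shapes and dtypes:

```python
# Toy reproduction of the fake-batch generation: integer-typed inputs get
# random ids, everything else gets uniform floats in [-1, 1). The tensor
# names and shapes here are purely illustrative.
import numpy as np

input_shape = {"input_ids:0": [1, 384], "image:0": [1, 3, 224, 224]}
input_type = ["INT64", "FLOAT32"]

batch_size = 4
data = {}
for i, (name, shape) in enumerate(input_shape.items()):
    shape = [batch_size] + shape[1:]
    if "INT" in input_type[i] or "LONG" in input_type[i]:
        data[name] = np.random.randint(0, 1000, size=shape).astype(np.int64)
    else:
        data[name] = (np.random.random(size=shape) * 2 - 1).astype(np.float32)

print({k: v.shape for k, v in data.items()})
```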
+ +import logging +import numpy as np +from general_perf.datasets import test_accuracy +from tqdm import tqdm + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent=10): + log.info("Start to calculate accuracy...") + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + + diffs = [] + for i in tqdm(range(num)): + test_data = self.dataloader.get_samples(i) + + results = self.runtime_backend.predict(test_data) + if isinstance(results, dict): + list_key = list(results.keys()) + list_key.sort() + for key in list_key: + diffs.extend(results[key].flatten()) + elif isinstance(results, list): + for out in results: + diffs.extend(out.flatten()) + else: + diffs.extend(results) + + log.info('Batch size is {}, Accuracy: {}'.format( + self.dataloader.cur_bs, 0.0)) + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + np.array(diffs), + allow_pickle=True) + return {"Fake Dataset Accuracy": 0} diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..709d6f7646ef73140c39ea3de9d4dd0b8aa66609 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/data_loader.py @@ -0,0 +1,95 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
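The fake-dataset accuracy checker above does not measure real accuracy; it flattens whatever the backend returns into a single vector and saves it as `<dataset>.npy` so that a later numeric-diff pass can compare runs. A stand-alone sketch of that flattening step:

```python
# Sketch of the output flattening used by the fake-dataset checker: model
# outputs (a dict or list of arrays) become one 1-D vector, with dict keys
# visited in sorted order so two runs flatten in the same order.
import numpy as np

def flatten_outputs(results):
    diffs = []
    if isinstance(results, dict):
        for key in sorted(results.keys()):
            diffs.extend(results[key].flatten())
    elif isinstance(results, list):
        for out in results:
            diffs.extend(out.flatten())
    else:
        diffs.extend(np.asarray(results).flatten())
    return np.array(diffs)

print(flatten_outputs({"b": np.ones((2, 2)), "a": np.zeros((1, 3))}).shape)  # (7,)
```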
+ +import logging +import os +import numpy as np +from general_perf.datasets import data_loader +from tqdm import tqdm +import collections + +log = logging.getLogger("CAIL2019") + +maxlen = 1024 + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + + log.info("Initial...") + self.config = config + self.cur_bs = 2 + + batch_token_ids = np.load( + "general_perf/datasets/{}/batch_token_ids.npy".format( + self.config['dataset_name']), + allow_pickle=True) + batch_segment_ids = np.load( + "general_perf/datasets/{}/batch_segment_ids.npy".format( + self.config['dataset_name']), + allow_pickle=True) + labels = np.load("general_perf/datasets/{}/label.npy".format( + self.config['dataset_name']), + allow_pickle=True) + self.feed_dict = collections.defaultdict(list) + self.feed_dict['batch_token_ids'] = batch_token_ids.tolist() + self.feed_dict['batch_segment_ids'] = batch_segment_ids.tolist() + self.feed_dict['label'] = labels.tolist() + + self.items = len(self.feed_dict['label']) + self.batch_num = int(self.items / self.cur_bs) + + for i in range(self.items): + batch_token_id = np.pad( + self.feed_dict['batch_token_ids'][i], + (0, 1024 - len(self.feed_dict['batch_token_ids'][i])), + 'constant').astype(np.float32) + batch_segment_id = np.pad( + self.feed_dict['batch_segment_ids'][i], + (0, 1024 - len(self.feed_dict['batch_segment_ids'][i])), + 'constant').astype(np.float32) + self.feed_dict['batch_token_ids'][i] = batch_token_id.tolist() + self.feed_dict['batch_segment_ids'][i] = batch_segment_id.tolist() + + def name(self): + return self.config['dataset_name'] + + def preprocess(self): + log.info("Preprocessing...") + + self.rebatch(self.cur_bs, skip=False) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + self.batch_num = int(self.items / self.cur_bs) + self.batched_data = [] + self.labels = [] + for i in tqdm(range(self.batch_num)): + split_data = { + 'input_segment:0': + self.feed_dict["batch_segment_ids"][i * self.cur_bs:(i + 1) * + self.cur_bs], + 'input_token:0': + self.feed_dict["batch_token_ids"][i * self.cur_bs:(i + 1) * + self.cur_bs], + } + self.labels.append( + self.feed_dict["label"][i * self.cur_bs:(i + 1) * self.cur_bs]) + self.batched_data.append(split_data) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/pre_process_data.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/pre_process_data.py new file mode 100644 index 0000000000000000000000000000000000000000..ce353805a686df60343cc68602fd83959ac7a74c --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/pre_process_data.py @@ -0,0 +1,56 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
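The CAIL2019 loader above right-pads every token and segment id sequence with zeros to `maxlen` (1024) so that batches have a uniform shape. A toy sketch of that padding step:

```python
# Minimal illustration of the fixed-length padding; the token ids here are
# toy values, not real vocabulary ids.
import numpy as np

maxlen = 1024
token_ids = [101, 2769, 102]  # toy sequence
padded = np.pad(token_ids, (0, maxlen - len(token_ids)), 'constant').astype(np.float32)
print(padded.shape)  # (1024,)
```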
+ +from tqdm import tqdm +import json +import collections +import numpy as np +from bert4keras.tokenizers import Tokenizer +import jieba +jieba.initialize() + +test_data = [] +with open("test.json", encoding='utf-8') as f: + for l in f: + l = json.loads(l) + assert l['label'] in 'BC' + if l['label'] == 'B': + test_data.append((l['A'], l['B'], l['C'])) + else: + test_data.append((l['A'], l['C'], l['B'])) + +tokenizer = Tokenizer("vocab.txt", + do_lower_case=True, + pre_tokenize=lambda s: jieba.cut(s, HMM=False)) + +feed_dict = collections.defaultdict(list) +maxlen = 1024 +for i in tqdm(range(len(test_data))): + (text1, text2, text3) = test_data[i] + token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen) + feed_dict["batch_token_ids"].append(token_ids) + feed_dict["batch_segment_ids"].append(segment_ids) + feed_dict["label"].append([1]) + token_ids, segment_ids = tokenizer.encode(text1, text3, maxlen=maxlen) + feed_dict["batch_token_ids"].append(token_ids) + feed_dict["batch_segment_ids"].append(segment_ids) + feed_dict["label"].append([0]) + +np.save("{}.npy".format('batch_token_ids'), + feed_dict["batch_token_ids"], + allow_pickle=True) +np.save("{}.npy".format('batch_segment_ids'), + feed_dict["batch_segment_ids"], + allow_pickle=True) +np.save("{}.npy".format('label'), feed_dict["label"], allow_pickle=True) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..5fd917f67adb39b6a36d9a6b69ef8528d1cc3dc3 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019/test_accuracy.py @@ -0,0 +1,45 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
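Because the preprocessing script above interleaves a positive pair (A, B) followed by a negative pair (A, C) for every example, the accuracy checker that follows can score the model simply by comparing even-indexed outputs against odd-indexed ones. A toy illustration of that pairwise comparison:

```python
# Even indices hold the score for the correct candidate, odd indices the
# score for the incorrect one; accuracy is the fraction of pairs where the
# positive score wins. Scores below are made up.
import numpy as np

scores = np.array([0.9, 0.2, 0.4, 0.7])  # [pos, neg, pos, neg]
good = (scores[::2] > scores[1::2]).sum()
total = len(scores) // 2
print(good / total)  # 0.5
```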
+ +import logging +import numpy as np +from general_perf.datasets import test_accuracy +from tqdm import tqdm + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent): + log.info("Start to calculate accuracy...") + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + good, total = 0, 0 + diffs = [] + for i in tqdm(range(num)): + test_data, labels = self.dataloader.get_samples(i) + + results = self.runtime_backend.predict(test_data) + results = results[list(results)[0]] + diffs.append(results) + + total += len(results) // 2 + good += (results[::2] > results[1::2]).sum() + + accuracy = round((good / total), 5) + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + diffs) + log.info('Batch size is {}, Accuracy: {}'.format( + self.dataloader.cur_bs, accuracy)) + return {"Top-1": accuracy} diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..4bef7f72acc9f75f1c29ee22ee712113a26165d6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/data_loader.py @@ -0,0 +1,155 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import logging + +import numpy as np +import os +import pickle +from tqdm import tqdm +from typing import Any +from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from PIL import Image +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +from general_perf.datasets import data_loader + +log = logging.getLogger("CIFAR100") + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64 +} + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + log.info("Initial...") + + base_folder = "general_perf/datasets/{}/cifar-100-python".format( + self.config['dataset_name']) + test_list = [ + ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'], + ] + meta = { + 'filename': 'meta', + 'key': 'fine_label_names', + 'md5': '7973b15100ade9c7d40fb424638fde48', + } + + self.data: Any = [] + self.targets = [] + + # now load the picked numpy arrays + for file_name, checksum in test_list: + file_path = os.path.join(base_folder, file_name) + with open(file_path, 'rb') as f: + entry = pickle.load(f, encoding='latin1') + self.data.append(entry['data']) + if 'labels' in entry: + self.targets.extend(entry['labels']) + else: + self.targets.extend(entry['fine_labels']) + + self.data = np.vstack(self.data).reshape(-1, 3, 32, 32) + self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC + + transformer = _transform() + path = os.path.join(base_folder, meta['filename']) + with open(path, 'rb') as infile: + data = pickle.load(infile, encoding='latin1') + self.classes = data[meta['key']] + self.class_to_idx = { + _class: i + for i, _class in enumerate(self.classes) + } + self.test_data = [] + for i in tqdm(range(len(self.data))): + img = self.data[i] + img = Image.fromarray(img) + img = transformer(img).detach().numpy() + self.test_data.append(img) + self.text_input = np.load(os.path.join(base_folder, 'text.npy')) + self.config = config + self.cur_bs = 1 + self.items = len(self.data) + self.batch_num = int(self.items / self.cur_bs) + + def name(self): + return self.config['dataset_name'] + + def preprocess(self): + log.info("Preprocessing...") + + self.rebatch(self.cur_bs, skip=False) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + self.batch_num = int(self.items / self.cur_bs) + self.batched_data = [] + self.labels = [] + for i in tqdm(range(self.batch_num)): + split_data = { + 'image': self.test_data[i * self.cur_bs:(i + 1) * self.cur_bs], + 'text': self.text_input, + } + self.labels.append(self.targets[i * self.cur_bs:(i + 1) * + self.cur_bs]) + self.batched_data.append(split_data) + + def get_fake_samples(self, batch_size, shape, input_type): + data = {} + if input_type: + i = 0 + for key, val in shape.items(): + if key == "image": + val = [val[0] * batch_size] + val[1:] + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + else: + data[key] = np.random.random(size=val).astype( + INPUT_TYPE[input_type[i]]) + i += 1 + return data + else: + raise ValueError("Please provide input type") + + +def _convert_image_to_rgb(image): + return image.convert("RGB") + + +def _transform(): + return Compose([ + Resize(224, interpolation=BICUBIC), + CenterCrop(224), + _convert_image_to_rgb, + ToTensor(), + 
Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe6a86087452ff8d65b1f21aa0a6901409fe3f2 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/test_accuracy.py @@ -0,0 +1,49 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +from general_perf.datasets import test_accuracy +from tqdm import tqdm + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent): + log.info("Start to calculate accuracy...") + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + good, total = 0, 0 + diffs = [] + for i in tqdm(range(num)): + test_data, labels = self.dataloader.get_samples(i) + logits_per_image, logits_per_text = self.runtime_backend.predict( + test_data) + diffs.append(logits_per_image) + + for j in range(len(logits_per_image)): + probs = logits_per_image[j] + + if np.argmax(probs) == labels[j]: + good += 1 + total += 1 + + accuracy = round((good / total), 5) + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + diffs, + allow_pickle=True) + log.info('Batch size is {}, Accuracy: {}'.format( + self.dataloader.cur_bs, accuracy)) + return {"Top-1": accuracy} diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..a224eaf59f720c8476b9ea8b085491f5e2884d5b --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/data_loader.py @@ -0,0 +1,102 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
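The CIFAR-100 checker above computes zero-shot accuracy by taking the argmax of the image-to-text logits for each sample and comparing it with the label. A stand-alone sketch with toy logits:

```python
# Toy zero-shot scoring: one row of logits per image, one column per class
# text; the predicted class is the argmax of each row.
import numpy as np

logits_per_image = np.array([[0.1, 0.7, 0.2],
                             [0.6, 0.3, 0.1]])  # (batch, num_classes)
labels = [1, 2]

good = sum(int(np.argmax(row) == lbl) for row, lbl in zip(logits_per_image, labels))
print(good / len(labels))  # 0.5
```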
+ +import logging +import os +from re import T +import numpy as np +from general_perf.datasets import data_loader +from tqdm import tqdm + +log = logging.getLogger("CriteoKaggle") + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + + log.info("Initial...") + self.config = config + self.cur_bs = 1 + if not os.path.exists("general_perf/datasets/{}/numeric.npy".format( + self.config['dataset_name'])): + from general_perf.datasets.open_criteo_kaggle.preprocess_dataset import csv_to_numpy + csv_to_numpy( + "general_perf/datasets/{}/eval.csv".format( + self.config['dataset_name']), + "general_perf/datasets/{}/".format(self.config['dataset_name'])) + + num = np.load("general_perf/datasets/{}/numeric.npy".format( + self.config['dataset_name'])) + cat = np.load("general_perf/datasets/{}/categorical.npy".format( + self.config['dataset_name'])) + label = np.load("general_perf/datasets/{}/label.npy".format( + self.config['dataset_name'])) + self.items = len(num) + self.batch_num = int(self.items / self.cur_bs) + self.feed_dict = {} + for i in tqdm(range(cat.shape[0])): + if i == 0: + self.feed_dict["new_categorical_placeholder:0"] = list( + cat[i].reshape(-1, 2)) + self.feed_dict["new_numeric_placeholder:0"] = list( + num[i].reshape(1, -1)) + self.feed_dict["label"] = list(label[i]) + else: + self.feed_dict["new_categorical_placeholder:0"].extend( + cat[i].reshape(-1, 2)) + self.feed_dict["new_numeric_placeholder:0"].extend( + num[i].reshape(1, -1)) + self.feed_dict["label"].extend(label[i]) + self.feed_dict['new_categorical_placeholder:0'] = np.array( + self.feed_dict['new_categorical_placeholder:0'], dtype=np.int64) + self.feed_dict['new_numeric_placeholder:0'] = np.array( + self.feed_dict['new_numeric_placeholder:0'], dtype=np.float32) + self.feed_dict['label'] = np.array(self.feed_dict['label'], + dtype=np.int64) + + def name(self): + return self.config['dataset_name'] + + def preprocess(self): + log.info("Preprocessing...") + + self.rebatch(self.cur_bs, skip=False) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + self.batch_num = int(self.items / self.cur_bs) + self.batched_data = [] + self.labels = [] + for i in tqdm(range(self.batch_num)): + split_data = { + 'new_categorical_placeholder:0': + self.feed_dict["new_categorical_placeholder:0"][i * + self.cur_bs * + 26:(i + 1) * + self.cur_bs * + 26, ], + 'new_numeric_placeholder:0': + self.feed_dict["new_numeric_placeholder:0"][ + i * self.cur_bs:(i + 1) * self.cur_bs, ], + } + self.labels.append( + self.feed_dict["label"][i * self.cur_bs:(i + 1) * + self.cur_bs, ]) + self.batched_data.append(split_data) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/preprocess_dataset.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/preprocess_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b38adf830c586fc706592801cd8f3f733c663888 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/preprocess_dataset.py @@ -0,0 +1,174 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import pandas +import argparse +import numpy as np +import tensorflow as tf + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--input', + type=str, + default="eval.csv", + help='full path of data file e.g. eval.csv', + dest='evaldatafile_path', + required=True) + + args = parser.parse_args() + return args + + +def version_is_less_than(a, b): + a_parts = a.split('.') + b_parts = b.split('.') + + for i in range(len(a_parts)): + if int(a_parts[i]) < int(b_parts[i]): + print('{} < {}, version_is_less_than() returning False'.format( + a_parts[i], b_parts[i])) + return True + return False + + +def csv_to_numpy(eval_csv_file, output): + print("TensorFlow version {}".format(tf.__version__)) + required_tf_version = '2.0.0' + + if version_is_less_than(tf.__version__, required_tf_version): + tf.compat.v1.enable_eager_execution() + + # args = parse_args() + # eval_csv_file = args.evaldatafile_path + + csv = pandas.read_csv(eval_csv_file, header=None) + if len(csv.columns) == 39: + dataset_type = 'test' + else: + dataset_type = 'eval' + + fill_na_dict = {} + if dataset_type == 'test': + for i in range(0, 13): + fill_na_dict[i] = 0.0 + for i in range(13, 39): + fill_na_dict[i] = "" + else: + for i in range(1, 14): + fill_na_dict[i] = 0.0 + for i in range(14, 40): + fill_na_dict[i] = "" + + csv = csv.fillna(value=fill_na_dict).values + + LABEL_COLUMN = ["clicked"] + CATEGORICAL_COLUMNS1 = ["C" + str(i) + "_embedding" for i in range(1, 27)] + NUMERIC_COLUMNS1 = ["I" + str(i) for i in range(1, 14)] + CATEGORICAL_COLUMNS2 = ["C" + str(i) + "_embedding" for i in range(1, 27)] + NUMERIC_COLUMNS2 = ["I" + str(i) for i in range(1, 14)] + + DATA_COLUMNS = LABEL_COLUMN + NUMERIC_COLUMNS1 + CATEGORICAL_COLUMNS1 + + CATEGORICAL_COLUMNS1.sort() + NUMERIC_COLUMNS1.sort() + + with open(eval_csv_file, 'r') as f: + nums = [line.strip('\n\r').split(',') for line in f.readlines()] + numpy_arr = np.array(nums) + numpy_arr[numpy_arr == ''] = '0' + min_list, max_list, range_list = [], [], [] + + for i in range(len(DATA_COLUMNS)): + if DATA_COLUMNS[i] in NUMERIC_COLUMNS1: + col_min = numpy_arr[:, i].astype(np.float32).min() + col_max = numpy_arr[:, i].astype(np.float32).max() + min_list.append(col_min) + max_list.append(col_max) + range_list.append(col_max - col_min) + + print('min list', min_list) + print('max list', max_list) + print('range list', range_list) + + all_data = [] + no_of_rows = 0 + for row in csv: + no_of_rows = no_of_rows + 1 + unnormalized_vals = np.array(row[1:14]) + normalized_vals = (unnormalized_vals - min_list) / range_list + new_categorical_dict = dict(zip(CATEGORICAL_COLUMNS2, row[14:40])) + + new_categorical_list = [] + for i in CATEGORICAL_COLUMNS1: + if pandas.isnull(new_categorical_dict[i]): + new_categorical_list.append("") + else: + new_categorical_list.append(new_categorical_dict[i]) + + if tf.executing_eagerly(): + hash_values = tf.strings.to_hash_bucket_fast( + new_categorical_list, 1000).numpy() + else: + hash_tensor = 
tf.strings.to_hash_bucket_fast( + new_categorical_list, 1000) + with tf.compat.v1.Session() as sess: + hash_values = hash_tensor.eval() + + new_numerical_dict = dict(zip(NUMERIC_COLUMNS2, normalized_vals)) + + item_data = { + "new_numeric_placeholder": [], + "new_categorical_placeholder": [], + "label": [] + } + + for i in NUMERIC_COLUMNS1: + item_data["new_numeric_placeholder"].extend( + [new_numerical_dict[i]]) + + for i in range(0, 26): + item_data["new_categorical_placeholder"].extend([i]) + item_data["new_categorical_placeholder"].extend([hash_values[i]]) + + item_data["label"].append(row[0]) + + all_data.append(item_data) + + wnd_num = [] + wnd_cate = [] + wnd_lable = [] + + for data in all_data: + wnd_num.append(data["new_numeric_placeholder"]) + wnd_cate.append(data["new_categorical_placeholder"]) + wnd_lable.append(data["label"]) + + np.save(os.path.join(output, "numeric.npy"), np.array(wnd_num)) + np.save(os.path.join(output, "categorical.npy"), np.array(wnd_cate)) + np.save(os.path.join(output, "label.npy"), np.array(wnd_lable)) + + print('Total number of rows ', no_of_rows) + print( + 'Generated output file name : wnd_num.npy, wnd_cate.npy, wnd_label.npy' + ) + + +if __name__ == "__main__": + csv_to_numpy() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..145e9cb3110e16361f1029aa941ca4dcf3ce08eb --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/test_accuracy.py @@ -0,0 +1,47 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
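`csv_to_numpy` above applies two transforms per row: min-max normalisation of the numeric columns and hashing of the categorical strings into 1000 buckets. A simplified numpy sketch of both ideas; note that Python's built-in `hash` stands in for `tf.strings.to_hash_bucket_fast` purely for illustration and is not bit-compatible with the TensorFlow op:

```python
# Toy data: two numeric columns and three categorical strings. The string
# values are invented for the example.
import numpy as np

numeric = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 20.0]])
col_min, col_max = numeric.min(axis=0), numeric.max(axis=0)
normalized = (numeric - col_min) / (col_max - col_min)  # per-column min-max scaling

categorical = ["ad-category-7", "", "ad-category-9"]
buckets = [hash(c) % 1000 for c in categorical]  # stand-in for the TF hash op

print(normalized)
print(buckets)
```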
+ +import logging +import numpy as np +from general_perf.datasets import test_accuracy +from tqdm import tqdm + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent): + log.info("Start to calculate accuracy...") + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + good, total = 0, 0 + diffs = [] + for i in tqdm(range(num)): + test_data, labels = self.dataloader.get_samples(i) + + results = self.runtime_backend.predict(test_data) + results = results[list(results)[0]] + diffs.append(results) + + for j in range(len(results)): + if np.argmax(results[j].round()) == labels[j].round(): + good += 1 + total += 1 + + accuracy = round((good / total), 5) + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + diffs) + log.info('Batch size is {}, Accuracy: {}'.format( + self.dataloader.cur_bs, accuracy)) + return {"Top-1": accuracy} diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..488ae1abd89532f3a3d9beccada6493eddfc37ab --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/data_loader.py @@ -0,0 +1,260 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
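The ImageNet loader that follows reads `val_map.txt`, where each line holds an image filename and its integer label separated by whitespace, and it stores `label + 1`. A minimal parsing sketch with an illustrative line:

```python
# One illustrative val_map.txt entry; the filename follows the standard
# ILSVRC2012 validation naming but the label value is arbitrary here.
import re

line = "ILSVRC2012_val_00000001.JPEG 65\n"
image_name, label = re.split(r"\s+", line.strip())
print(image_name, int(label) + 1)
```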
+ +import logging +import os +from os.path import split +import re +import time + +import cv2 +import numpy as np +import random +from tqdm import tqdm + +from general_perf.datasets import data_loader + +log = logging.getLogger("Imagenet") + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, +} + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + log.info("Initial...") + + self.config = config + self.cur_bs = 1 + self.image_size = [224, 224, 3] + + if self.config['framework'] == 'Tensorflow': + image_format = "NHWC" + pre_process = pre_process_vgg + else: + image_format = "NCHW" + if 'resnet50' in self.config['model']: + pre_process = pre_process_imagenet_pytorch + else: + pre_process = pre_process_imagenet_vit + + cache_dir = os.getcwd() + \ + "/general_perf/datasets/{}".format(self.config['dataset_name']) + self.input_name = self.config['inputs'] + self.image_list = [] + self.label_list = [] + self.count = None + self.use_cache = 0 + self.cache_dir = os.path.join(cache_dir, "preprocessed", + self.config['model']) + self.data_path = "general_perf/datasets/{}/ILSVRC2012_img_val".format( + self.config['dataset_name']) + self.pre_process = pre_process + self.items = 0 + # input images are in HWC + self.need_transpose = True if image_format == "NCHW" else False + not_found = 0 + os.makedirs(self.cache_dir, exist_ok=True) + + image_list = 'general_perf/datasets/{}/val_map.txt'.format( + self.config['dataset_name']) + + start = time.time() + with open(image_list, 'r') as f: + for s in tqdm(f): + image_name, label = re.split(r"\s+", s.strip()) + src = os.path.join(self.data_path, image_name) + if not os.path.exists(src): + # if the image does not exists ignore it + not_found += 1 + continue + os.makedirs(os.path.dirname( + os.path.join(self.cache_dir, image_name)), + exist_ok=True) + dst = os.path.join(self.cache_dir, image_name) + if not os.path.exists(dst + ".npy"): + img_org = cv2.imread(src) + processed = self.pre_process( + img_org, + need_transpose=self.need_transpose, + dims=self.image_size) + np.save(dst, processed) + + self.image_list.append(image_name) + self.label_list.append(int(label) + 1) + self.items = len(self.image_list) + + # limit the dataset if requested + if self.count and len(self.image_list) >= self.count: + break + + time_taken = time.time() - start + if not self.image_list: + log.error("no images in image list found") + raise ValueError("no images in image list found") + if not_found > 0: + log.info("reduced image list, %d images not found", not_found) + + log.info("loaded {} images, cache={}, took={:.1f}sec".format( + len(self.image_list), self.use_cache, time_taken)) + + self.label_list = np.array(self.label_list) + self.batch_num = int(self.items / self.cur_bs) + self.shuffle_index = [i for i in range(self.items)] + random.seed(7) + random.shuffle(self.shuffle_index) + + def name(self): + return self.config['dataset_name'] + + def preprocess(self): + log.info("Preprocessing...") + + self.rebatch(self.cur_bs, skip=False) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + self.batch_num = int(self.items / self.cur_bs) + self.batched_data = [] + self.labels = [] + for i in tqdm(range(self.batch_num)): + split_data, labels = [], [] + for j in range(i * self.cur_bs, (i + 1) * self.cur_bs): + output, label = self.get_item(self.shuffle_index[j]) + split_data.append(output) 
+ labels.append(label) + + self.labels.append(labels) + self.batched_data.append({self.input_name: np.array(split_data)}) + + def get_samples(self, sample_id): + if sample_id >= len(self.batched_data) or sample_id < 0: + raise ValueError("Your Input ID: {} is out of range: {}".format( + sample_id, len(self.batched_data))) + return self.batched_data[sample_id], self.labels[sample_id] + + def get_item(self, nr): + """Get image by number in the list.""" + dst = os.path.join(self.cache_dir, self.image_list[nr]) + img = np.load(dst + ".npy") + return img, self.label_list[nr] + + +# +# pre-processing +# +def center_crop(img, out_height, out_width): + height, width, _ = img.shape + left = int((width - out_width) / 2) + right = int((width + out_width) / 2) + top = int((height - out_height) / 2) + bottom = int((height + out_height) / 2) + img = img[top:bottom, left:right] + return img + + +def resize_with_aspectratio(img, + out_height, + out_width, + scale=87.5, + inter_pol=cv2.INTER_LINEAR): + height, width, _ = img.shape + new_height = int(100. * out_height / scale) + new_width = int(100. * out_width / scale) + if height > width: + w = new_width + h = int(new_height * height / width) + else: + h = new_height + w = int(new_width * width / height) + img = cv2.resize(img, (w, h), interpolation=inter_pol) + return img + + +def pre_process_vgg(img, dims=None, need_transpose=False): + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + output_height, output_width, _ = dims + cv2_interpol = cv2.INTER_AREA + img = resize_with_aspectratio(img, + output_height, + output_width, + inter_pol=cv2_interpol) + img = center_crop(img, output_height, output_width) + img = np.asarray(img, dtype='float32') + + # normalize image + means = np.array([123.68, 116.78, 103.94], dtype=np.float32) + img -= means + + # transpose if needed + if need_transpose: + img = img.transpose([2, 0, 1]) + return img + + +def pre_process_imagenet_pytorch(img, dims=None, need_transpose=False): + from PIL import Image + import torchvision.transforms.functional as F + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = Image.fromarray(img) + img = F.resize(img, 256, Image.BILINEAR) + img = F.center_crop(img, 224) + img = F.to_tensor(img) + img = F.normalize(img, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + inplace=False) + if not need_transpose: + img = img.permute(1, 2, 0) # NHWC + img = np.asarray(img, dtype='float32') + return img + +def pre_process_imagenet_vit(img, dims=None, need_transpose=False): + from PIL import Image + import torchvision.transforms.functional as F + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = Image.fromarray(img) + img = F.resize(img, 256, Image.BILINEAR) + img = F.center_crop(img, 384) + img = F.to_tensor(img) + img = F.normalize(img, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + inplace=False) + if not need_transpose: + img = img.permute(1, 2, 0) # NHWC + img = np.asarray(img, dtype='float32') + return img + + +def maybe_resize(img, dims): + img = np.array(img, dtype=np.float32) + if len(img.shape) < 3 or img.shape[2] != 3: + # some images might be grayscale + img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if dims != None: + im_height, im_width, _ = dims + img = cv2.resize(img, (im_width, im_height), + interpolation=cv2.INTER_LINEAR) + return img diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/test_accuracy.py new 
file mode 100644 index 0000000000000000000000000000000000000000..6275aaf21210842c055190d0cae4d533e8504e12 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/test_accuracy.py @@ -0,0 +1,66 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import numpy as np +from general_perf.datasets import test_accuracy +from tqdm import tqdm +import torch + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent): + log.info("Start to calculate accuracy...") + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + good, total = 0, 0 + diffs = [] + for i in tqdm(range(num)): + test_data, labels = self.dataloader.get_samples(i) + + results = self.runtime_backend.predict(test_data) + if "resnet50-tf-fp16" in self.configs["model"]: + if 'classes' in results: + del results['classes'] + results = self._post_processing(results, self.configs['framework']) + diffs.append(results) + for j in range(len(results)): + if np.argmax(results[j]) == labels[j]: + good += 1 + total += 1 + accuracy = round((good / total), 5) + log.info('Batch size is {}, Accuracy: {}'.format( + self.dataloader.cur_bs, accuracy)) + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + diffs) + return {"Top-1": accuracy} + + def _post_processing(self, inputs, framework): + if framework == "Onnx": + if isinstance(inputs, list): + inputs = list(inputs[0]) + elif isinstance(inputs, dict): + key = list(inputs.keys())[0] + inputs = list(inputs[key]) + else: + if isinstance(inputs, tuple): + inputs = inputs[0].float().cpu().numpy().astype(float) if inputs[0].dtype==torch.bfloat16 else inputs[0].cpu().numpy().astype(float) + else: + inputs = inputs[list(inputs)[0]] + if framework == "Pytorch" or framework == "Onnx": + inputs = np.array( + [np.insert(inputs[i], 0, 0) for i in range(len(inputs))]) + return inputs diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/accuracy_squad.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/accuracy_squad.py new file mode 100644 index 0000000000000000000000000000000000000000..18c97dd41766d8a9eee7c07ac28c471973d51d61 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/accuracy_squad.py @@ -0,0 +1,322 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import collections +import json +import math +import os +import sys + +sys.path.insert(0, os.path.dirname(__file__)) + +import numpy as np +import six +from bert import tokenization + +# To support feature cache. +import pickle + +max_seq_length = 384 +max_query_length = 64 +doc_stride = 128 + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + +dtype_map = { + "int8": np.int8, + "int16": np.int16, + "int32": np.int32, + "int64": np.int64, + "float16": np.float16, + "float32": np.float32, + "float64": np.float64 +} + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
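    # Illustrative example of the maps built by _strip_spaces above: for the
    # text "a b c" it returns ns_text "abc" with ns_to_s_map {0: 0, 1: 2, 2: 4}.
    # Inverting the tok-side map below therefore lets a match position found in
    # tok_text be translated back into a character offset of orig_text.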
+ tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), + key=lambda x: x[1], + reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def write_predictions(all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + max_examples=None): + """Write final predictions to the json file and log-odds of null if needed.""" + print("Writing predictions to: %s" % (output_prediction_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", [ + "feature_index", "start_index", "end_index", "start_logit", + "end_logit" + ]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + if max_examples and example_index == max_examples: break + + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + # FIX: During compliance/audit runs, we only generate a small subset of + # all entries from the dataset. As a result, sometimes dict retrieval + # fails because a key is missing. 
+ # result = unique_id_to_result[feature.unique_id] + result = unique_id_to_result.get(feature.unique_id, None) + if result is None: + continue + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get( + start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + prelim_predictions = sorted(prelim_predictions, + key=lambda x: + (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + nbest.append( + _NbestPrediction(text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + all_predictions[example.qas_id] = nbest_json[0]["text"] + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/evaluate.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..177e136dfdb3fb5294c4213e1a22e79345b78723 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/bert/evaluate.py @@ -0,0 +1,102 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
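+
+# Worked example for the metrics below (illustrative strings, not from the dataset):
+# prediction "the cat sat" vs. ground truth "the cat" share two tokens, giving
+# precision = 2/3, recall = 2/2 and F1 = 2*p*r/(p+r) = 0.8; Exact Match is 1 only
+# when the normalized prediction and ground truth strings are identical.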
+ +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions, num): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths(f1_score, prediction, + ground_truths) + total = num + exact_match = round(100.0 * exact_match / total, 5) + f1 = round(100.0 * f1 / total, 5) + + return {'Exact Match': exact_match, 'F1 Score': f1} + + +def check_accuracy(dataset_file, prediction_file, num): + expected_version = '1.1' + with open(dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print('Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + with open(prediction_file) as prediction_file: + predictions = json.load(prediction_file) + return evaluate(dataset, predictions, num) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/create_squad_data.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/create_squad_data.py new file mode 100644 index 0000000000000000000000000000000000000000..ff84c61e62c49e05fa17f148a4db02285458b4d1 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/create_squad_data.py @@ -0,0 +1,427 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import tokenization +import six + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + For examples without an answer, the start and end position are -1. + """ + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % (tokenization.printable_text( + self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, + is_training, + version_2_with_negative=False): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file) as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not 
is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer." + ) + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + print("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample(qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, + num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. 
Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def convert_examples_to_features(examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + output_fn, + verbose_logging=False): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, + tokenizer, example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
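+        # Illustrative numbers (hypothetical): with max_tokens_for_doc = 361 and
+        # doc_stride = 128, a 500-token document produces spans (0, 361), (128, 361)
+        # and (256, 244); the loop below emits exactly these (start, length) pairs and
+        # stops once a span reaches the end of the document.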
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len( + tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, + doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
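+                # Illustrative mapping (hypothetical sizes): with 20 query tokens the packed
+                # sequence is [CLS] + query + [SEP], so doc_offset = 22 and a document token
+                # at position p maps to index p - doc_start + 22; answers that fall outside
+                # [doc_start, doc_end] are flagged as out_of_span below.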
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start + and tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if verbose_logging and example_index < 20: + print("*** Example ***") + print("unique_id: %s" % (unique_id)) + print("example_index: %s" % (example_index)) + print("doc_span_index: %s" % (doc_span_index)) + print( + "tokens: %s" % + " ".join([tokenization.printable_text(x) for x in tokens])) + print("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) + for (x, y) in six.iteritems(token_to_orig_map) + ])) + print("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) + for (x, y) in six.iteritems(token_is_max_context) + ])) + print("input_ids: %s" % " ".join([str(x) for x in input_ids])) + print("input_mask: %s" % " ".join([str(x) + for x in input_mask])) + print("segment_ids: %s" % + " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + print("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join( + tokens[start_position:(end_position + 1)]) + print("start_position: %d" % (start_position)) + print("end_position: %d" % (end_position)) + print("answer: %s" % + (tokenization.printable_text(answer_text))) + + feature = InputFeatures(unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + + # Run callback + output_fn(feature) + + unique_id += 1 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9ad6a4d7c0b0f4bd47536d98786cb6fb2551ec --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py @@ -0,0 +1,199 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +# To support feature cache. 
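+# Converted SQuAD features are pickled to
+# general_perf/datasets/open_squad/eval_features_<model>.pickle (see DataLoader.__init__
+# below), so later runs reload the cache instead of re-tokenizing the examples.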
+import pickle +from transformers import BertTokenizer, AutoTokenizer +from general_perf.datasets.open_squad.create_squad_data import read_squad_examples, convert_examples_to_features +import collections +from general_perf.datasets import data_loader +import logging +from tqdm import tqdm +import numpy as np + +INPUT_TYPE = { + "UINT8": np.uint8, + "FLOAT32": np.float32, + "LONG": np.long, + "INT32": np.int32, + "INT64": np.int64 +} + +max_seq_length = 384 +max_query_length = 64 +doc_stride = 128 + +log = logging.getLogger("SQUAD") + + +class DataLoader(data_loader.Dataset): + def __init__(self, config): + super(DataLoader, self).__init__(config) + + log.info("Initial...") + self.config = config + model = self.config["model"] + total_count_override = None + perf_count_override = None + eval_features = [] + # Load features if cached, convert from examples otherwise. + input_file = "general_perf/datasets/open_squad/dev-v1.1.json" + cache_path = 'general_perf/datasets/open_squad/eval_features_' + self.config[ + 'model'] + '.pickle' + if os.path.exists(cache_path): + with open(cache_path, 'rb') as cache_file: + eval_features = pickle.load(cache_file) + eval_examples = read_squad_examples(input_file=input_file, + is_training=False, + version_2_with_negative=False) + else: + log.info("Start to generate data") + if "roberta" in self.config['model']: + tokenizer = AutoTokenizer.from_pretrained( + "csarron/roberta-base-squad-v1") + elif "albert" in self.config['model']: + tokenizer = AutoTokenizer.from_pretrained( + "madlag/albert-base-v2-squad") + elif "deberta" in self.config['model']: + tokenizer = AutoTokenizer.from_pretrained( + "Palak/microsoft_deberta-base_squad") + else: + tokenizer = BertTokenizer( + "general_perf/datasets/open_squad/vocab.txt") + eval_examples = read_squad_examples(input_file=input_file, + is_training=False, + version_2_with_negative=False) + + def append_feature(feature): + eval_features.append(feature) + + convert_examples_to_features(examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=False, + output_fn=append_feature, + verbose_logging=False) + + with open(cache_path, 'wb') as cache_file: + pickle.dump(eval_features, cache_file) + + self.eval_features = eval_features + self.eval_examples = eval_examples + self.count = total_count_override or len(self.eval_features) + self.items = len(self.eval_features) + self.perf_count = perf_count_override or self.count + self.model = model + self.cur_bs = 1 + self.batch_num = int(self.items / self.cur_bs) + + # save mask name to help setting the the results at unmasked positions to zero + if "roberta" in self.model or "torch" in self.model: + self.mask_name = "attention_mask.1" + else: + self.mask_name = "input_mask:0" + + def name(self): + return self.config['dataset_name'] + + def preprocess(self): + log.info("Preprocessing...") + + self.rebatch(self.batch_num, skip=False) + + def rebatch(self, new_bs, skip=True): + log.info("Rebatching batch size to: {} ...".format(new_bs)) + + if self.cur_bs == new_bs and skip: + return + + self.cur_bs = new_bs + self.batch_num = int(self.items / self.cur_bs) + self.batched_data = [] + for i in tqdm(range(self.batch_num)): + features = collections.defaultdict(list) + for j in range(i * self.cur_bs, (i + 1) * self.cur_bs): + if "torch" in self.model: + features['input_ids.1'].append( + self.eval_features[j].input_ids) + features['attention_mask.1'].append( + self.eval_features[j].input_mask) 
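+                    # token_type_ids differ per model (branches below): RoBERTa gets all
+                    # zeros, DeBERTa uses the plain 'token_type_ids' key, and other torch
+                    # models reuse the segment ids produced during feature conversion.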
+ if "roberta" in self.model: + features['token_type_ids.1'].append( + np.zeros((384,))) + elif "deberta" in self.model: + features['token_type_ids'].append( + self.eval_features[j].segment_ids) + else: + features['token_type_ids.1'].append( + self.eval_features[j].segment_ids) + else: + features['input_ids:0'].append( + self.eval_features[j].input_ids) + features['input_mask:0'].append( + self.eval_features[j].input_mask) + features['segment_ids:0'].append( + self.eval_features[j].segment_ids) + self.batched_data.append(features) + + def get_samples(self, sample_id): + if sample_id >= len(self.batched_data) or sample_id < 0: + raise ValueError("Your Input ID is out of range") + return self.batched_data[sample_id], [] + + def get_id(self, sample_id): + if sample_id >= len(self.batched_data) or sample_id < 0: + raise ValueError("Your Input ID is out of range") + return [ + self.eval_features[i].unique_id + for i in range(sample_id * self.cur_bs, (sample_id + 1) * + self.cur_bs) + ] + + def get_fake_samples(self, batch_size, shape, input_type): + data = {} + + avg_seq_len = 192 + max_seq_len = 384 + + if input_type: + i = 0 + for key, val in shape.items(): + val = [val[0] * batch_size] + val[1:] + if i == 0: + # fake input id and mask + input_ids = np.zeros(val).astype(INPUT_TYPE[input_type[i]]) + data[key] = input_ids + elif i == 1: + # fake input array length + input_len = np.random.randint(low=2 * avg_seq_len - + max_seq_len, + high=max_seq_len + 1, + size=(batch_size), + dtype=np.int32) + + input_mask = np.zeros(val).astype( + INPUT_TYPE[input_type[i]]) + + for b_idx, s_len in enumerate(input_len): + input_mask[b_idx][:s_len] = 1 + data[key] = input_mask + else: + data[key] = np.zeros(val).astype(INPUT_TYPE[input_type[i]]) + i += 1 + return data + else: + raise ValueError("Please provide input type") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..5edd352cbd9970c7502255654d8824f74f2ee1a6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/test_accuracy.py @@ -0,0 +1,134 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
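+
+# This checker runs the SQuAD batches through the vendor backend, reassembles
+# per-feature RawResults, writes predictions.json with write_predictions(), and
+# scores it against dev-v1.1.json using check_accuracy() (Exact Match / F1).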
+ +import logging +import os +import collections +import numpy as np +import tensorflow as tf +import torch +from tqdm import tqdm + +from general_perf.datasets.open_squad.bert.accuracy_squad import write_predictions +from general_perf.datasets.open_squad.bert.evaluate import check_accuracy +from general_perf.datasets import test_accuracy + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + +log = logging.getLogger("TestAccuracy") + + +class AccuracyChecker(test_accuracy.AccuracyChecker): + def calculate_acc(self, data_percent): + log.info("Start to calculate accuracy...") + results, diffs = [], [] + num = int((data_percent / 100) * self.dataloader.get_batch_count() + ) if data_percent else self.dataloader.get_batch_count() + + for i in tqdm(range(num)): + test_data, _ = self.dataloader.get_samples(i) + unique_ids = self.dataloader.get_id(i) + result = self.runtime_backend.predict(test_data) + start_logits, end_logits = self._post_processing( + result, self.configs['framework']) + + # set results at unmasked positions to zero since the vendor's result may have different value at those meaningless positions + def set_unmask_to_zero(res, mask): + arr = np.array(res) + arr[mask == 0] = 0.0 + return list(arr) + + for i, mask in enumerate(np.array(test_data[self.dataloader.mask_name])): + for i, sl in enumerate(start_logits): + start_logits[i] = set_unmask_to_zero(sl, mask) + + for i, el in enumerate(end_logits): + end_logits[i] = set_unmask_to_zero(el, mask) + + for i, u_id in enumerate(unique_ids): + results.append( + RawResult(unique_id=u_id, + start_logits=start_logits[i], + end_logits=end_logits[i])) + + diffs.append(start_logits + end_logits) + + np.save(self.output_dir + "/{}.npy".format(self.dataloader.name()), + diffs) + data_file = os.path.abspath('.') + "/general_perf/datasets/open_squad/dev-v1.1.json" + predict_file = self.output_dir[:self.output_dir. 
+ rindex('/')] + "/predictions.json" + write_predictions(self.dataloader.eval_examples, + self.dataloader.eval_features, results, 20, 30, True, + predict_file) + result = check_accuracy(data_file, predict_file, + num * self.dataloader.cur_bs) + log.info('Batch size is {}, F1: {}, Exact Match:{}'.format( + self.dataloader.cur_bs, result['F1 Score'], result['Exact Match'])) + return result + + def _post_processing(self, inputs, framework): + start_results, end_results = [], [] + + if framework == "Tensorflow": + if 'distill' in self.configs['model']: + (start_logits, end_logits) = (inputs["output_0"], + inputs["output_1"]) + for i in range(self.dataloader.cur_bs): + start_logit = [float(x) for x in start_logits[i].flat] + end_logit = [float(x) for x in end_logits[i].flat] + start_results.append(start_logit) + end_results.append(end_logit) + else: + tensor_name = list(inputs)[0] + for i in range(len(inputs[tensor_name])): + logits = tf.transpose(np.array([inputs[tensor_name][i]]), + [2, 0, 1]) + unstacked_logits = tf.unstack(logits, axis=0) + if tf.executing_eagerly(): + (start_logit, + end_logit) = (unstacked_logits[0].numpy(), + unstacked_logits[1].numpy()) + else: + with tf.compat.v1.Session(): + (start_logit, + end_logit) = (unstacked_logits[0].eval(), + unstacked_logits[1].eval()) + start_logit = [float(x) for x in start_logit.flat] + end_logit = [float(x) for x in end_logit.flat] + start_results.append(start_logit) + end_results.append(end_logit) + else: + if isinstance(inputs, dict): + (start_logits, end_logits) = ( + inputs["start_logits"], + inputs["end_logits"], + ) + elif isinstance(inputs[0], torch.Tensor): + (start_logits, end_logits) = ( + inputs[0].float().cpu().detach().numpy() if inputs[0].dtype==torch.bfloat16 else inputs[0].cpu().detach().numpy(), + inputs[1].float().cpu().detach().numpy() if inputs[1].dtype==torch.bfloat16 else inputs[1].cpu().detach().numpy(), + ) + else: + (start_logits, end_logits) = (inputs[0], inputs[1]) + + for i in range(self.dataloader.cur_bs): + start_logit = [float(x) for x in start_logits[i].flat] + end_logit = [float(x) for x in end_logits[i].flat] + start_results.append(start_logit) + end_results.append(end_logit) + + return start_results, end_results diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/test_accuracy.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/test_accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..25b2d3cb5d8d3802bb67d034515d2e1676a227d1 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/datasets/test_accuracy.py @@ -0,0 +1,118 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import logging +from typing import Any, Dict +import matplotlib.pyplot as plt +import numpy as np + +log = logging.getLogger("TestAccuracy") + + +def draw_all_diff(ori_outs, cur_outs, file_name) -> Dict[str, Any]: + ori_data = ori_outs.flatten() + cur_data = cur_outs.flatten() + ''' + Nan & Inf is not compareable, replece with 0 + ''' + ori_data[np.isnan(ori_data)] = 0.0 + ori_data[np.isinf(ori_data)] = 0.0 + + cur_data[np.isnan(cur_data)] = 0.0 + cur_data[np.isinf(cur_data)] = 0.0 + + length = min(ori_data.shape[0], 300) + diff = ori_data - cur_data + + ori_data = np.where(ori_data == 0, 1, ori_data) + rel_diff = np.divide(diff, ori_data) + rel_diff = np.nan_to_num(rel_diff) + + log.info('Mean Diff: {}, Std Diff: {}, Max Diff: {}, Max Rel-Diff: {}, Mean Rel-Diff: {}'.format( + np.mean(abs(diff)), np.std(abs(diff)), + abs(diff).max(), abs(rel_diff).max(), np.mean(abs(rel_diff)))) + + result = {} + result["Mean Diff"] = round(float(np.mean(abs(diff))), 5) + result["Std Diff"] = round(float(np.std(abs(diff))), 5) + result["Max Diff"] = round(float(abs(diff).max()), 5) + result["Max Rel-Diff"] = round(float(abs(rel_diff).max()), 5) + result["Mean Rel-Diff"] = round(float(np.mean(abs(rel_diff))), 5) + + plt.figure(figsize=(16, 8)) + + plt.cla() + + plt.subplot(1, 3, 1) + plt.yscale('log') + plt.hist(diff, + bins=length, + alpha=0.5, + label='Diff', + range=(diff.min(), diff.max())) + plt.xlabel("Diff Distribute") + + plt.subplot(1, 3, 2) + plt.yscale('log') + plt.hist(ori_data, + bins=length, + alpha=0.5, + label='CPU', + range=(ori_data.min(), ori_data.max())) + plt.xlabel("CPU Result") + + plt.subplot(1, 3, 3) + plt.yscale('log') + plt.hist(cur_data, + bins=length, + alpha=0.5, + label='Backend', + range=(cur_data.min(), cur_data.max())) + plt.xlabel("Backend Result") + + plt.savefig(file_name, dpi=300) + return result + + +class AccuracyChecker(): + def __init__(self): + self.configs = None + self.dataloader = None + self.runtime_backend = None + self.output_dir = "" + + def calculate_diff(self) -> Dict[str, float]: + """ + Return a dictionary of Mean Diff, Std Diff and Max Diff + + Args: None + + Returns: Dict[str, float] + """ + cpu_data_path = os.path.abspath('general_perf/reports/CPU/' + + self.configs["model"]) + if not os.path.exists(cpu_data_path): + log.info("Fetch CPU Data Failed") + return {} + vendor_data = np.load(self.output_dir + + "/{}.npy".format(self.dataloader.name())) + cpu_data = np.load(cpu_data_path + + "/{}.npy".format(self.dataloader.name())) + return draw_all_diff( + cpu_data, vendor_data, + self.output_dir + "/" + self.configs["model"] + '-to-' + self.configs['compile_precision'].lower() + '.png') + + def calculate_acc(self, data_percent) -> Dict[str, Any]: + raise NotImplementedError("Dataset: caculate_acc") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/launch.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..51cf30609f7536f0c35651a689a106c6611c8f3d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/launch.py @@ -0,0 +1,89 @@ +import os +import sys +import argparse +import subprocess +import logging +import json + +# ${prj_root}/byte_infer_perf +BYTE_MLPERF_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +os.chdir(BYTE_MLPERF_ROOT) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +from general_perf.core.configs.workload_store import load_workload + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("LANUCH") + + 
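+# Typical invocation (illustrative; available task and backend names come from the
+# local workloads/ and backends/ directories):
+#   python3 general_perf/launch.py --task bert-torch-fp32 --hardware_type CPU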
+def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="", + help="The task to be evaluated, refs to workloads/") + parser.add_argument( + "--hardware_type", + default="CPU", + help="The backend to be evaluated, refs to backends/") + parser.add_argument("--compile_only", + action='store_true', + help="Task will stop after compilation finishes") + parser.add_argument("--show_task_list", + action='store_true', + help="Print all task names") + parser.add_argument("--show_hardware_list", + action='store_true', + help="Print all hardware backends bytemlperf supports") + args = parser.parse_args() + return args + + +def main(): + parsed_args = get_args() + + if parsed_args.show_task_list: + log.info("******************* Supported Task *******************") + for file in os.listdir('general_perf/workloads'): + print(file[:-5]) + + if parsed_args.show_hardware_list: + log.info("***************** Supported Hardware Backend *****************") + for file in os.listdir('general_perf/backends'): + if not file.endswith('.py') and not file.startswith('_'): + print(file) + + if parsed_args.task: + log.info("******************* Pip Package Installing *******************") + subprocess.call([ + 'python3', '-m', 'pip', 'install', 'pip', '--upgrade', '--quiet']) + subprocess.call([ + 'python3', '-m', 'pip', 'install', '-r', 'general_perf/requirements.txt', '--quiet']) + + workload = load_workload(parsed_args.task) + with open("general_perf/model_zoo/" + workload['model'] + '.json', 'r') as file: + model_info = json.load(file) + + if not os.path.exists(model_info['model_path']): + subprocess.call([ + 'bash', 'general_perf/prepare_model_and_dataset.sh', + model_info['model'], model_info['dataset_name'] or "None"]) + + # test numeric + if workload['test_numeric'] and not parsed_args.compile_only and not workload['compile_only']: + log.info("******************************************* Running CPU Numeric Checker... *******************************************") + subprocess.call([ + 'bash', 'general_perf/backends/CPU/calculate_cpu_diff.sh', + workload['model'], + str(workload['batch_sizes'][0]) + ]) + + cmd = f'python3 general_perf/core/perf_engine.py --hardware_type {parsed_args.hardware_type} --task {parsed_args.task}' + if parsed_args.compile_only: + cmd += ' --compile_only' + exit_code = subprocess.call(cmd, shell=True) + sys.exit(exit_code) + + +if __name__ == '__main__': + main() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/prepare_model_and_dataset.sh b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/prepare_model_and_dataset.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e89c2ff2525d0d7b1203848db448cf30b8a3739 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/prepare_model_and_dataset.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +echo "******************* Downloading Model....
*******************" + +mkdir -p general_perf/model_zoo/regular +mkdir -p general_perf/model_zoo/popular +mkdir -p general_perf/model_zoo/sota +mkdir -p general_perf/download + +#--Basic Model-- +# https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/bert_mhlo.tar +# https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50_mhlo.tar + +if [ $1 == "bert-tf-fp32" -o $1 == "bert-torch-fp32" ]; then + wget -O general_perf/download/open_bert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_bert.tar + tar xf general_perf/download/open_bert.tar -C general_perf/model_zoo/regular/ +elif [ $1 == "resnet50-tf-fp32" -o $1 == "resnet50-torch-fp32" ]; then + wget -O general_perf/download/resnet50.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/resnet50.tar + tar xf general_perf/download/resnet50.tar -C general_perf/model_zoo/regular/ +elif [ $1 == "widedeep-tf-fp32" ]; then + wget -O general_perf/download/open_wide_deep.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_wide_deep_saved_model.tar + tar xf general_perf/download/open_wide_deep.tar -C general_perf/model_zoo/regular/ +#--Popular Model-- +elif [ $1 == "albert-torch-fp32" ]; then + wget -O general_perf/download/open_albert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_albert.tar + tar xf general_perf/download/open_albert.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "roformer-tf-fp32" ]; then + wget -O general_perf/download/open_roformer.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roformer.tar + tar xf general_perf/download/open_roformer.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "videobert-onnx-fp32" ]; then + wget -O general_perf/download/open_videobert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_videobert.tar + tar xf general_perf/download/open_videobert.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "yolov5-onnx-fp32" ]; then + wget -O general_perf/download/open_yolov5.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_yolov5.tar + tar xf general_perf/download/open_yolov5.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "conformer-encoder-onnx-fp32" ]; then + wget -O general_perf/download/open_conformer.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_conformer.tar + tar xf general_perf/download/open_conformer.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "roberta-torch-fp32" ]; then + wget -O general_perf/download/open_roberta.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roberta.tar + tar xf general_perf/download/open_roberta.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "deberta-torch-fp32" ]; then + wget -O general_perf/download/open_deberta.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_deberta.tar + tar xf general_perf/download/open_deberta.tar -C general_perf/model_zoo/popular/ +elif [ $1 == "swin-large-torch-fp32" ]; then + wget -O general_perf/download/open-swin-large.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open-swin-large.tar + tar xf general_perf/download/open-swin-large.tar -C general_perf/model_zoo/popular/ +#--Sota Model-- +elif [ $1 == "vae-encoder-onnx-fp32" -o $1 == "vae-decoder-onnx-fp32" -o $1 == "clip-onnx-fp32" ]; then + wget -O general_perf/download/stable_diffusion.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/stable_diffusion.tar + tar xf general_perf/download/stable_diffusion.tar -C general_perf/model_zoo/sota/ +elif [ $1 == "unet-onnx-fp32" ]; then + wget -O general_perf/download/unet.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/unet.tar + tar xf 
general_perf/download/unet.tar -C general_perf/model_zoo/sota/ +elif [ $1 == "gpt2-torch-fp32" ]; then + wget -O general_perf/download/traced_gpt2.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/traced_gpt2.tar + mkdir general_perf/model_zoo/sota/traced_gpt2 + tar xf general_perf/download/traced_gpt2.tar -C general_perf/model_zoo/sota/ +elif [ $1 == "chatglm2-6b-torch-fp16" ]; then + wget -O general_perf/download/chatglm2-6b.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/chatglm2-6b.tar + tar xf general_perf/download/chatglm2-6b.tar -C general_perf/model_zoo/sota/ +elif [ $1 == "llama2-7b-torch-fp16" ]; then + wget -O general_perf/download/llama-7b.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/llama-7b.tar + tar xf general_perf/download/llama-7b.tar -C general_perf/model_zoo/sota/ +fi + +# Download Datasets +if [ $2 == "open_imagenet" ] && [ ! -f "general_perf/download/open_imagenet.tar" ] ; then + wget -O general_perf/download/open_imagenet.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_imagenet.tar + tar xf general_perf/download/open_imagenet.tar -C general_perf/datasets/ +elif [ $2 == "open_squad" ] && [ ! -f "general_perf/download/open_squad.tar" ]; then + wget -O general_perf/download/open_squad.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar + tar xf general_perf/download/open_squad.tar -C general_perf/datasets/open_squad +elif [ $2 == "open_criteo_kaggle" ] && [ ! -f "general_perf/download/eval.csv" ]; then + wget -O general_perf/download/eval.csv https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/eval.csv + cp general_perf/download/eval.csv general_perf/datasets/open_criteo_kaggle/eval.csv +elif [ $2 == "open_cail2019" ] && [ ! -f "general_perf/download/open_cail2019.tar" ]; then + wget -O general_perf/download/open_cail2019.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_cail2019.tar + tar xf general_perf/download/open_cail2019.tar -C general_perf/datasets/open_cail2019 --strip-components 1 +elif [ $2 == "open_cifar" ] && [ ! -f "general_perf/download/cifar-100-python.tar" ]; then + wget -O general_perf/download/cifar-100-python.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/cifar-100-python.tar + tar xf general_perf/download/cifar-100-python.tar -C general_perf/datasets/open_cifar +fi + +echo "Extract Done." 
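+
+# Usage note (illustrative): the first argument selects the model archive and the
+# second the dataset, e.g.
+#   bash general_perf/prepare_model_and_dataset.sh bert-torch-fp32 open_squad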
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/albert-torch-fp32/albert-torch-fp32-to-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/albert-torch-fp32/albert-torch-fp32-to-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..e99f09475480164396aa509a593f18f9418e0022 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/albert-torch-fp32/albert-torch-fp32-to-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/bert-torch-fp32/bert-torch-fp32-to-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/bert-torch-fp32/bert-torch-fp32-to-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..7228b71c0ec9f49add69e9b66c63e99053206055 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/bert-torch-fp32/bert-torch-fp32-to-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/deberta-torch-fp32/deberta-torch-fp32-to-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/deberta-torch-fp32/deberta-torch-fp32-to-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..e2b96884f82a452e748bcf1e3ec762fdb682b64c Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/deberta-torch-fp32/deberta-torch-fp32-to-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..697a484a2bcba2da67c3de58a7ef60531ebc3e4f Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..a241cc5d89af76f592389fff15eabff7b49b6980 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/HPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/albert-torch-fp32/albert-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/albert-torch-fp32/albert-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..e6a45931f71ef5effc12859491646b9f10e0bfcd Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/albert-torch-fp32/albert-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..50bde3e4ae5aed4c24e84a53580580d79b723571 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp8.png 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..2102bac75c99d3abcf355531ab26d58552a9c670 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/bert-torch-fp32/bert-torch-fp32-to-fp8.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/clip-onnx-fp32/clip-onnx-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/clip-onnx-fp32/clip-onnx-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..84f821f91e7a0955b77199cc9c3c3d4193d89346 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/clip-onnx-fp32/clip-onnx-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..1bd30ae2938fa7e87694e665c6ead7b81906486c Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/deberta-torch-fp32/deberta-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/deberta-torch-fp32/deberta-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..fe5d1467c8bbb8c1636ae92d2ce0aafa0ef49e71 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/deberta-torch-fp32/deberta-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..17d1accb2b63b241e9e10374f1d61f3849e1364f Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp8.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..248063103bb0d505722169dc54494d264bfb1429 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/resnet50-torch-fp32/resnet50-torch-fp32-to-fp8.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roberta-torch-fp32/roberta-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roberta-torch-fp32/roberta-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..8945887628c4481afc2c290dd177256031752b08 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roberta-torch-fp32/roberta-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roformer-tf-fp32/roformer-tf-fp32-to-fp16.png 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roformer-tf-fp32/roformer-tf-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..13fd5f9dca897f1ab274383dea9461eb08d559ec Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/roformer-tf-fp32/roformer-tf-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..220a5628d72457ebec5badb2c15131a261f7d7f6 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp8.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp8.png new file mode 100644 index 0000000000000000000000000000000000000000..220a5628d72457ebec5badb2c15131a261f7d7f6 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/swin-large-torch-fp32/swin-large-torch-fp32-to-fp8.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/videobert-onnx-fp32/videobert-onnx-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/videobert-onnx-fp32/videobert-onnx-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..fc45b54181fdc41e985c428d956fe2c110aca7a4 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/videobert-onnx-fp32/videobert-onnx-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/widedeep-tf-fp32/widedeep-tf-fp32-to-fp16.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/widedeep-tf-fp32/widedeep-tf-fp32-to-fp16.png new file mode 100644 index 0000000000000000000000000000000000000000..a37c156267782458b6846d8862cae9135a4b17c7 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/IPU/widedeep-tf-fp32/widedeep-tf-fp32-to-fp16.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/README b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/README new file mode 100644 index 0000000000000000000000000000000000000000..17e969953bdf21ad85d563779b5a9f56b87b87a0 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/README @@ -0,0 +1 @@ +benchmark reports diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/albert-torch-fp32/albert-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/albert-torch-fp32/albert-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..9c4950411a391244e6d5852e0a1c0016061afb33 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/albert-torch-fp32/albert-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/bert-torch-fp32/bert-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/bert-torch-fp32/bert-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..03d05ff27662a8fc1c743dad676f6fa30afb464b Binary files /dev/null and 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/bert-torch-fp32/bert-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..1d3e2ab028a17bee7390170738dbf50d299bc562 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/conformer-encoder-onnx-fp32/conformer-encoder-onnx-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/resnet50-torch-fp32/resnet50-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/resnet50-torch-fp32/resnet50-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..64541e003df242da96a153f3a917e57632a575bb Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/resnet50-torch-fp32/resnet50-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/roberta-torch-fp32/roberta-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/roberta-torch-fp32/roberta-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..8753c9f69368210210618041142776e7e4f84d0b Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/SPU/roberta-torch-fp32/roberta-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/albert-torch-fp32/albert-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/albert-torch-fp32/albert-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..9709fc1237ee27705cf7564e78d41f41ba818a3a Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/albert-torch-fp32/albert-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-tf-fp32/bert-tf-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-tf-fp32/bert-tf-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..a95d612434227f73b2e40a2c74427187bcadd9b8 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-tf-fp32/bert-tf-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-torch-fp32/bert-torch-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-torch-fp32/bert-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..2db639fe1b5253cf03fc33795d6b98d9db7c219e Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/bert-torch-fp32/bert-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/resnet50-tf-fp32/resnet50-tf-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/resnet50-tf-fp32/resnet50-tf-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..5f6d2c0937c373660f69c3c7f5ec7ceeb206fe64 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/resnet50-tf-fp32/resnet50-tf-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/roberta-torch-fp32/roberta-torch-fp32.png 
b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/roberta-torch-fp32/roberta-torch-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..63cf07755aa6ebaa6738008ef0e57505190ab395 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/roberta-torch-fp32/roberta-torch-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/widedeep-tf-fp32/widedeep-tf-fp32.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/widedeep-tf-fp32/widedeep-tf-fp32.png new file mode 100644 index 0000000000000000000000000000000000000000..e1a42b976cbd04f9d434fc0aea0540f880244210 Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/STC/widedeep-tf-fp32/widedeep-tf-fp32.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.png b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.png new file mode 100644 index 0000000000000000000000000000000000000000..4cf284c88a9912280c7dafdc5cd1c71cf574ae9f Binary files /dev/null and b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.png differ diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..f3df3b010958659f9cef56b71b2f0fad19a4c6ed --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/reports/reports_summary.py @@ -0,0 +1,125 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
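+# Usage note (added for clarity; the invocation below is an illustrative example, not
+# part of the original tool): this script scans general_perf/reports/<backend>/<model>/result.json,
+# takes the best QPS from each report's 'Performance' section, and renders a grouped
+# bar chart to general_perf/reports/reports_summary.png. Assuming it is run from the
+# directory that contains general_perf/ (so the relative paths resolve):
+#
+#     python3 general_perf/reports/reports_summary.py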
+
+import os
+import sys
+import json
+import numpy as np
+import matplotlib.pyplot as plt
+
+'''
+labels : sequence of x-axis tick labels
+datas : data set, a 2-D list; every element must have the same length as labels
+tick_step : x-axis tick step, 1 by default; it can be adjusted via tick_step
+group_gap : gap between groups of bars; it should be positive, otherwise groups overlap
+bar_gap : gap between bars within a group, 0 by default (bars touch); positive values add a gap, negative values make bars overlap
+'''
+def create_multi_bars(summary):
+    tick_step = 8
+    group_gap = 5
+    bar_gap = 0
+    labels = []
+    datas = []
+    backends = []
+
+    for name in summary.keys():
+        labels.append(name)
+
+    backends = list(summary[labels[0]].keys())
+
+    for item in summary.values():
+        for idx, qps in enumerate(item.values()):
+            if idx == len(datas):
+                datas.append([qps])
+            else:
+                datas[idx].append(qps)
+
+    # ticks are the x-axis tick positions
+    ticks = np.arange(len(labels)) * tick_step
+    # group_num is the number of data series, i.e. the number of bars in each group
+    group_num = len(datas)
+    # group_width is the total width of each group of bars; group_gap is the gap between groups
+    group_width = tick_step - group_gap
+    # bar_span is the x-axis distance between bars within a group, i.e. bar width plus bar gap
+    bar_span = group_width / group_num
+    # bar_width is the actual width of each bar
+    bar_width = bar_span - bar_gap
+    # baseline_x is the base x position of the first bar in each group; the following bars are offset by bar_span
+    baseline_x = ticks - (group_width - bar_span) / 2
+
+    def autolabel(backend, rects):
+        """Attach a text label above each bar in *rects*, showing its height."""
+        for rect in rects:
+            height = rect.get_height()
+            plt.annotate('{}:{}'.format(backend, height),
+                         xy=(rect.get_x() + rect.get_width() / 2, height),
+                         xytext=(0, 3),  # 3 points vertical offset
+                         textcoords="offset points",
+                         ha='center', va='bottom')
+
+    plt.figure(figsize=(22, 15))
+    for index, y in enumerate(datas):
+        rects = plt.bar(baseline_x + index * bar_span, y, bar_width, label=backends[index])
+        autolabel(backends[index], rects)
+
+    # place the x tick labels at the tick positions
+    plt.xticks(ticks, labels, rotation=330)
+
+    plt.legend()
+    plt.xlabel('Backends')
+    plt.ylabel('Model QPS')
+    plt.yscale('log')
+    plt.title('Reports Summary(QPS)')
+
+    plt.savefig("general_perf/reports/reports_summary.png", dpi=100)
+
+def get_best_qps(backend, report_name):
+    if not os.path.exists('general_perf/reports/' + backend + '/' + report_name + "/result.json"):
+        return 0
+
+    with open('general_perf/reports/' + backend + '/' + report_name + "/result.json", 'r') as f:
+        report_info = json.load(f)
+        all_qps = report_info['Performance']
+        best_qps = 0
+        for qps in all_qps:
+            if qps['QPS'] > best_qps:
+                best_qps = qps['QPS']
+        return int(best_qps)
+
+def reports_summary():
+    all_backends = []
+    for file in os.listdir('general_perf/reports'):
+        if os.path.isdir(os.path.join('general_perf/reports', file)):
+            all_backends.append(file)
+
+    all_reports_names = []
+    for backend in all_backends:
+        for report_name in os.listdir('general_perf/reports/' + backend):
+            if report_name not in all_reports_names:
+                all_reports_names.append(report_name)
+
+    summary = {}
+    for name in all_reports_names:
+        summary[name] = {key: 0 for key in all_backends}
+
+    for report_name in summary.items():
+        for backend in report_name[1].keys():
+            best_qps = get_best_qps(backend, report_name[0])
+            summary[report_name[0]][backend] = best_qps
+
+    create_multi_bars(summary)
+
+
+if __name__ == "__main__":
+    reports_summary()
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e57e2c9c8f12a0f98011960785b28899a54741cf
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
@@ -0,0 +1,12 @@
+matplotlib
+pandas
+virtualenv==16.7.9
+scikit-learn
+prompt_toolkit
+tqdm
+opencv-python
+transformers
+tokenization
+fpdf +typing-extensions==3.7.4.3 +numpy==1.23.0 diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/build_pdf.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/build_pdf.py new file mode 100644 index 0000000000000000000000000000000000000000..7d849045ba5f94b5be40e2e47fd57691455d8e29 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/build_pdf.py @@ -0,0 +1,202 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from fpdf import FPDF +import json +import math +import os + + +class PDF(FPDF): + def titles(self, title, backend): + self.set_xy(0.0, 0.0) + self.set_font('Times', 'B', 16) + # self.set_text_color(220, 50, 50) + self.cell(w=210.0, + h=40.0, + align='C', + txt=title + ' REPORT (' + backend + ')', + border=0) + + def lines(self): + self.rect(5.0, 5.0, 200.0, 287.0) + + def icon(self, icon_path): + self.set_xy(10.0, 10.0) + self.image(icon_path, link='', type='', w=37.6, h=5.2) + self.set_xy(157.0, 0.0) + self.set_font('Times', 'B', 10) + # self.set_text_color(220, 50, 50) + self.cell(w=60.0, h=25.0, align='C', txt='BYTE MLPERF', border=0) + + def charts(self, chart_path): + self.y += 5 + self.x += 6 + self.image(chart_path, link='', type='', w=700 / 4, h=450 / 4.9) + + def diff_tables(self, data, dataset): + col_width = 45 + # self.set_xy(10.00125,40) + x = self.x + i = 0 + self.set_font("Times", 'B', size=10) + line_height = self.font_size * 2.5 + self.x = x + 5 + self.multi_cell(90 * math.ceil(((len(data)) / 3)), + line_height, + 'Accuracy Results' + ' (' + dataset + ')', + border=1, + align='C') + y = self.y + reset_y = self.y + self.ln(line_height) + self.set_font("Times", size=10) + final_y = None + for i, (key, val) in enumerate(data.items()): + if i < 4: + if (i % 3 == 0): + final_y = y + y = reset_y + self.x = x + 90 * (i // 3) + 5 + self.y = y + self.multi_cell(col_width, + line_height, + key, + border=1, + align='C') + self.x += (45 + 90 * (i // 3)) + 5 + self.y = y + self.multi_cell(col_width, + line_height, + str(val), + border=1, + align='C') + y = self.y + i += 1 + if final_y: + self.y = final_y + + def graph_tables(self, data): + real_data = [] + row_name = [] + row_data = [] + for key, val in data.items(): + row_name.append(key) + row_data.append(str(val)) + real_data.append(row_name) + real_data.append(row_data) + + col_width = 45 + self.set_xy(10.00125, 30) + x = self.x + self.x += 27 + self.set_font("Times", 'B', size=10) + line_height = self.font_size * 2.5 + self.multi_cell(135, + line_height, + 'Graph Compilation Results', + border=1, + align='C') + y = self.y + self.ln(line_height) + self.set_font("Times", size=10) + for row in real_data: + self.x = x + for i, datum in enumerate(row): + self.y = y + self.x += (i + 1) * 45 - 18 + self.multi_cell(col_width, + line_height, + str(datum), + border=1, + align='C') + y = self.y + self.y += 5 + + def performance_tables(self, data): + real_data = [] + row_name = [] + for i in range(len(data)): + row_data = [] + for key, val 
in data[i].items(): + if i == 0: + row_name.append(key) + row_data.append(val) + real_data.append(row_data) + real_data.insert(0, row_name) + + col_width = 33.75 + self.set_xy(10.00125, 65) + x = self.x + self.x += 27 + self.set_font("Times", 'B', size=10) + line_height = self.font_size * 2.5 + self.multi_cell(135, + line_height, + 'Performance Results', + border=1, + align='C') + y = self.y + self.ln(line_height) + self.set_font("Times", size=10) + for row in real_data: + self.x = x + for i, datum in enumerate(row): + self.y = y + self.x += (i + 1) * 33.75 - 6.75 + self.multi_cell(col_width, + line_height, + str(datum), + border=1, + align='C') + y = self.y + + self.ln(line_height) + + def footer(self): + # Go to 1.5 cm from bottom + self.set_y(-15) + # Select Arial italic 8 + self.set_font('Arial', 'I', 8) + # Print centered page number + self.cell(0, 10, '%s' % self.page_no(), 0, 0, 'C') + + def generate_report(self, path): + with open(path, 'r') as f: + report = json.load(f) + output_dir = os.path.dirname(path) + '/' + index = output_dir.index('ByteMLPerf') + len('ByteMLPerf') + base_path = output_dir[:index] + + icon_path = os.path.join(base_path, 'docs/images/icon.png') + self.add_page() + self.lines() + self.icon(icon_path) + self.graph_tables(report['Graph Compile']) + if 'Performance' in report: + self.performance_tables(report['Performance']) + if 'Accuracy' in report: + self.diff_tables(report['Accuracy'], report['Dataset']) + if 'Diff Dist' in report['Accuracy']: + self.charts(output_dir + report['Accuracy']['Diff Dist']) + self.titles(report['Model'], report['Backend']) + self.set_author('Bytedance') + precision = path.split('/')[-1].split('-')[1] + self.output(output_dir + report['Model'] + '-TO-' + precision.upper() + '.pdf', 'F') + return True + + +def build_pdf(path): + pdf = PDF(orientation='P', unit='mm', format='A4') + return pdf.generate_report(path) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/convert.sh b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/convert.sh new file mode 100644 index 0000000000000000000000000000000000000000..f2238f750780de26d693f8e0d35914b4de31bdf6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/convert.sh @@ -0,0 +1,17 @@ +#!bin/bash +if [ ! -d "tools/venv" ]; then + python3 -m virtualenv tools/venv + source tools/venv/bin/activate + tools/venv/bin/python3 -m pip install --upgrade pip -q + tools/venv/bin/python3 -m pip install -r tools/requirements.txt -q +else + source tools/venv/bin/activate +fi + +if [ "$3" == "pt2onnx" ];then + python3 tools/torch_to_onnx.py --model_path $1 --output_path $2 +elif [ "$3" == "saved2onnx" ];then + python3 tools/saved_to_onnx.py --model_path $1 --output_path $2 +elif [ "$3" == "saved2frozen" ];then + python3 tools/saved_to_frozen.py --model_path $1 --output_path $2 +fi diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/frozen_to_saved.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/frozen_to_saved.py new file mode 100644 index 0000000000000000000000000000000000000000..98a43e86cd661a377959a8c245651b6ebb49b41d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/frozen_to_saved.py @@ -0,0 +1,67 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.saved_model import signature_constants +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() + + +def convert_pb_to_server_model(pb_model_path, export_dir, input_names, + output_names): + if not input_names: + raise ValueError("Converter needs inputs") + if not output_names: + raise ValueError("Converter needs outputs") + input_names = input_names.split(",") + output_names = output_names.split(",") + graph_def = read_pb_model(pb_model_path) + convert_pb_saved_model(graph_def, export_dir, input_names, output_names) + + +def read_pb_model(pb_model_path): + with tf.io.gfile.GFile(pb_model_path, "rb") as f: + graph_def = tf.compat.v1.GraphDef() + graph_def.ParseFromString(f.read()) + return graph_def + + +def convert_pb_saved_model(graph_def, export_dir, input_names, output_names): + builder = tf.saved_model.builder.SavedModelBuilder(export_dir) + + sigs = {} + with tf.Session(graph=tf.Graph()) as sess: + tf.import_graph_def(graph_def, name="") + g = tf.get_default_graph() + input_infos = {} + output_infos = {} + for input_name in input_names: + input_infos[input_name] = g.get_tensor_by_name(input_name) + for output_name in output_names: + output_infos[output_name] = g.get_tensor_by_name(output_name) + + sigs[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = \ + tf.saved_model.signature_def_utils.predict_signature_def( + input_infos, output_infos) + + builder.add_meta_graph_and_variables(sess, [tag_constants.SERVING], + signature_def_map=sigs) + builder.save() + + +path = "densenet121.pb" +convert_pb_to_server_model(path, + os.path.abspath('.') + "/densenet_saved_model", + "input_1", "fc1000") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/h5_to_frozen.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/h5_to_frozen.py new file mode 100644 index 0000000000000000000000000000000000000000..0310d4da2df87f612b8a101c163749bfeb054dca --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/h5_to_frozen.py @@ -0,0 +1,56 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
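+# Usage sketch (added for clarity; file names below are illustrative examples, not part
+# of the original tool): this script loads a Keras .h5 model and writes a single frozen
+# GraphDef (.pb) via convert_variables_to_constants_v2, e.g.:
+#
+#     python3 h5_to_frozen.py --h5_model_path model.h5 --workdir ./ --freezed_pb_name model_frozen.pb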
+ +import tensorflow as tf +from tensorflow.keras import backend +from tensorflow.python.tools import freeze_graph +from tensorflow.python.framework.convert_to_constants import convert_variables_to_constants_v2 +import logging +import argparse + + +def frozen_graph(h5_file_path, workdir, pb_name): + model = tf.keras.models.load_model(h5_file_path, + custom_objects={ + "backend": backend, + }) + model.summary() + + full_model = tf.function(lambda input_1: model(input_1)) + full_model = full_model.get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype)) + + # Get frozen ConcreteFunction + frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func.graph.as_graph_def() + + layers = [op.name for op in frozen_func.graph.get_operations()] + print(frozen_func.outputs) + + # Save frozen graph from frozen ConcreteFunction to hard drive + tf.io.write_graph(graph_or_graph_def=frozen_func.graph, + logdir=workdir, + name=pb_name, + as_text=False) + print('model has been saved') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='VC model h5->freezedpb script') + parser.add_argument("--h5_model_path", type=str, required=True) + parser.add_argument("--freezed_pb_name", type=str, required=True) + parser.add_argument("--workdir", type=str, required=True) + args = parser.parse_args() + frozen_graph(args.h5_model_path, args.workdir, args.freezed_pb_name) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/model_trt_convert.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/model_trt_convert.py new file mode 100644 index 0000000000000000000000000000000000000000..2028a545d7c6b4078a730c29118d6595ec3a6f36 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/model_trt_convert.py @@ -0,0 +1,43 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
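+# Note (added for clarity): this script converts a TensorFlow SavedModel to a TF-TRT
+# engine in INT8 mode, using a random-data calibration loop. The input path
+# 'byte_mlperf/model_zoo/resnet50_saved_model' and the output path 'test' are
+# hard-coded below, so edit them for your model before running:
+#
+#     python3 model_trt_convert.py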
+ +import tensorflow as tf +import numpy as np +gpus = tf.config.experimental.list_physical_devices('GPU') +if gpus: + try: + # Currently, memory growth needs to be the same across GPUs + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + logical_gpus = tf.config.experimental.list_logical_devices('GPU') + print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs") + except RuntimeError as e: + # Memory growth must be set before GPUs have been initialized + print(e) + + +def my_calibration_input_fn(): + for _ in range(10): + yield np.random.normal(size=(1, 224, 224, 3)).astype(np.uint8), + # yield tf.random.normal((1, 224, 224, 3)).astype(np.uint8), + + +saved_model_path = 'byte_mlperf/model_zoo/resnet50_saved_model' +model_params = tf.experimental.tensorrt.ConversionParams( + precision_mode="int8".upper(), max_batch_size=64, use_calibration=True) +model_trt = tf.experimental.tensorrt.Converter( + input_saved_model_dir=saved_model_path, conversion_params=model_params) +model_trt.convert(calibration_input_fn=my_calibration_input_fn) +output_saved_model_dir = 'test' +model_trt.save(output_saved_model_dir) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/mxnet_to_onnx.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/mxnet_to_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..5af7385a29da02219b0c6e0ba381163f28f6290e --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/mxnet_to_onnx.py @@ -0,0 +1,84 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
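+# Usage sketch (added for clarity; file names are illustrative examples, not part of
+# the original tool): this script exports an MXNet symbol/params checkpoint to ONNX via
+# mx.onnx.export_model and validates the result with the ONNX checker. Following the
+# example in do_mxnet2onnx's docstring, a call looks like:
+#
+#     do_mxnet2onnx('model-symbol.json', 'model-0000.params', 'model.onnx',
+#                   [(1, 3, 736, 416)], [np.float32], [(None, 3, 736, 416)])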
+ +import mxnet as mx + +import numpy as np +import onnx + + +def get_mod(prefix, epoch, ctx, data_shape): + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + + mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + + mod.bind(for_training=False, + data_shapes=[("data", data_shape)], + label_shapes=mod._label_shapes) + + mod.set_params(arg_params, aux_params, allow_missing=True) + + return mod + + +def load_mxnet(): + prefix = "image_level_space" + epoch = 0 + ctx = mx.cpu() + data_shape = (1, 3, 736, 416) + + mod = get_mod(prefix, epoch, ctx, data_shape) + + return mod + + +''' +require mxnet >= 19.0 +''' + + +def do_mxnet2onnx(sym, params, onnx_file, in_shapes, in_types, + dynamic_input_shapes): + ''' + example: + + sym = 'byte_mlperf/byte_mlperf/download/manysplit/image_level_space-symbol.json' + params = 'byte_mlperf/byte_mlperf/download/manysplit/image_level_space-0000.params' + onnx_file = 'manysplit.onnx' + + in_shapes = [(1,3,736,416)] + in_types = [np.float32] + dynamic_input_shapes = [(None,3,736,416)] + ''' + + converted_model_path = mx.onnx.export_model( + sym, + params, + in_shapes, + in_types, + onnx_file, + dynamic=True, + dynamic_input_shapes=dynamic_input_shapes, + verbose=True) + + # Load the ONNX model + model_proto = onnx.load_model(converted_model_path) + + # Check if the converted ONNX protobuf is valid + onnx.checker.check_graph(model_proto.graph) + + +if __name__ == "__main__": + # load_mxnet() + do_mxnet2onnx() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/onnx_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/onnx_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..69d9e2638fc9f9e845643e02ad1377f3dac2c885 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/onnx_utils.py @@ -0,0 +1,699 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
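+# Overview (added for clarity): this module collects standalone ONNX graph helpers:
+# node/initializer/attribute lookups, graph input/output edits, and rewrite passes such
+# as convert_fp16_to_fp32, replace_mask_where, convert_expand_to_tile and remove_qdq.
+# The __main__ block at the bottom is a hard-coded experiment that expects local files
+# under converted_models/ plus the onnx2torch and onnxruntime packages.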
+ +from typing import cast +import numpy as np +from numpy.lib.function_base import append +import onnx +import onnx.helper as helper +import onnxruntime as rt +from onnx import numpy_helper +from onnx.tools import update_model_dims +from onnx import shape_inference, TensorProto +import struct +import copy +import sys +''' +DType Info +''' +ONNX_DTYPE = { + 0: TensorProto.FLOAT, # UNDEFINE, default as float32 + 1: TensorProto.FLOAT, + 2: TensorProto.UINT8, + 3: TensorProto.INT8, + 4: TensorProto.UINT16, + 5: TensorProto.INT16, + 6: TensorProto.INT32, + 7: TensorProto.INT64, + 8: TensorProto.STRING, + 9: TensorProto.BOOL, + 10: TensorProto.FLOAT16, + 11: TensorProto.DOUBLE, + 12: TensorProto.UINT32, + 13: TensorProto.UINT64, +} +''' +Nodes +''' + + +def get_node_by_name(graph, name): + for node in graph.node: + if node.name == name: + return node + return None + + +def get_nodes_by_optype(graph, typename): + nodes = [] + for node in graph.node: + if node.op_type == typename: + nodes.append(node) + return nodes + + +def get_node_by_output_name(graph, name): + for node in graph.node: + if node.output[0] == name: + return node + return None + + +def get_node_successor(graph, target_node): + successor = [] + for node in graph.node: + if len(list(set(node.input).intersection(set( + target_node.output)))) > 0: + successor.append(node) + return successor + + +def get_value_info_by_name(graph, name): + for val_info in graph.value_info: + if val_info.name == name: + return val_info + return None + + +def get_shape_from_value_info(val_info): + shape = [d.dim_value for d in val_info.type.tensor_type.shape.dim] + return shape + + +def remove_weights(graph, name_list): + rm_list = [] + for weight in graph.initializer: + if weight.name in name_list: + rm_list.append(weight) + for weight in rm_list: + graph.initializer.remove(weight) + + +def remove_inputs(graph, name_list): + rm_list = [] + for input_t in graph.input: + if input_t.name in name_list: + rm_list.append(input_t) + for input_t in rm_list: + graph.input.remove(input_t) + + +def remove_value_infos(graph, name_list): + rm_list = [] + for value_info in graph.value_info: + if value_info.name in name_list: + rm_list.append(value_info) + for value_info in rm_list: + graph.value_info.remove(value_info) + + +def remove_node_by_name(graph, name): + target_node = get_node_by_name(graph, name) + remove_node(graph, target_node) + + +def remove_node(graph, target_node): + ''' + remove the node with only one input and only one output + ''' + node_input = target_node.input[0] + node_output = target_node.output[0] + # set input of successor node to predecessor node of target node + for node in graph.node: + for i, n in enumerate(node.input): + if n == node_output: + node.input[i] = node_input + + target_names = set(target_node.input) & set( + [weight.name for weight in graph.initializer]) + remove_weights(graph, target_names) + target_names.add(node_output) + remove_inputs(graph, target_names) + remove_value_infos(graph, target_names) + graph.node.remove(target_node) + + +''' +Constant & Initializer +''' + + +def is_initializer(graph, name): + for tensor in graph.initializer: + if tensor.name == name: + return True + return False + + +def get_initializer_by_name(graph, name): + for tensor in graph.initializer: + if tensor.name == name: + return tensor + return None + + +def get_init_value(tensor): + return numpy_helper.to_array(tensor) + + +def set_init_value(graph, weight, data_numpy): + # NOTE: weight can be stroed in human readable fields(float_data, 
int32_data, string_data, ...) + # as well as raw_data, if we set weight by raw_data, we must clear the fields above to make it effective + # NOTE: data_type between numpy and TensorProto + + raw_shape = tuple([i for i in weight.dims]) + new_shape = np.shape(data_numpy) + + if weight.data_type == 8: + # string data type is special, it requires to store data in string_data field + # NOT the raw_data field + weight.string_data = bytes(data_numpy, encoding="utf8") + weight.ClearField("raw_data") + + return + + if new_shape != raw_shape: + print( + "Warning: the new weight shape is not consistent with original shape!" + ) + + weight.dims[:] = list(new_shape) + + # in cast is graph input? + for model_input in graph.input: + if model_input.name == weight.name: + # copy from onnx.helper... + tensor_shape_proto = model_input.type.tensor_type.shape + tensor_shape_proto.ClearField("dim") + tensor_shape_proto.dim.extend([]) + for d in new_shape: + dim = tensor_shape_proto.dim.add() + dim.dim_value = d + + weight.ClearField("float_data") + weight.ClearField("int32_data") + weight.ClearField("int64_data") + weight.raw_data = data_numpy.tobytes() + + return + + +def is_constant(node): + if node.op_type == "Constant": + return True + else: + return False + + +def get_constant_value(node): + for attr in node.attribute: + if attr.name == 'value': + if attr.t.data_type == 1: + return np.array(struct.unpack('f', attr.t.raw_data)) + elif attr.t.data_type == 2: + return np.array(struct.unpack('i', attr.t.raw_data)) + elif attr.t.data_type == 3: + return np.array(struct.unpack('s', attr.t.raw_data)) + elif attr.t.data_type == 4: + return np.array(struct.unpack('t', attr.t.raw_data)) + elif attr.t.data_type == 5: + return np.array(struct.unpack('g', attr.t.raw_data)) + elif attr.t.data_type == 6: + return np.frombuffer(attr.t.raw_data, dtype=np.float32) + elif attr.t.data_type == 7: + return np.frombuffer(attr.t.raw_data, dtype=np.int32) + elif attr.t.data_type == 8: + return np.frombuffer(attr.t.raw_data, dtype=np.string) + elif attr.t.data_type == 9: + return np.frombuffer(attr.t.raw_data, dtype=np.bool) + elif attr.t.data_type == 10: + return np.frombuffer(attr.t.raw_data, dtype=np.float16) + elif attr.t.data_type == 11: + return np.frombuffer(attr.t.raw_data, dtype=np.double) + elif attr.t.data_type == 12: + return np.frombuffer(attr.t.raw_data, dtype=np.uint32) + elif attr.t.data_type == 13: + return np.frombuffer(attr.t.raw_data, dtype=np.uint64) + else: + print("unsupported attribute data type with attribute name") + + +def set_constant_value(target_node, value): + # NOTE : dtype value should match with target_node + for attr in target_node.attribute: + if (attr.name == "value"): + attr.t.raw_data = value.tobytes() + + +''' +Attributes +''' + + +def get_attribute_by_name(node, name): + for attr in node.attribute: + if attr.name == name: + return attr + return attr + + +def set_node_attribute(target_node, attr_name, attr_value): + flag = False + for attr in target_node.attribute: + if (attr.name == attr_name): + if attr.type == 1: # float value + attr.f = attr_value + elif attr.type == 2: # int value + attr.i = attr_value + elif attr.type == 3: # string value + attr.s = attr_value + elif attr.type == 4: # tensor value + attr.t = attr_value + elif attr.type == 5: # graph value + attr.g = attr_value + # NOTE: For repeated composite types, we should use something like + # del attr.xxx[:] + # attr.xxx.extend([n1, n2, n3]) + elif attr.type == 6: # float[] + attr.floats[:] = attr_value + elif attr.type == 7: # 
int[] + attr.ints[:] = attr_value + elif attr.type == 8: # strings[] + attr.strings[:] = attr_value + else: + print("unsupported attribute data type with attribute name") + return False + flag = True + + if not flag: + # attribute not in original node + print("Warning: you are appending a new attribute to the node!") + target_node.attribute.append( + helper.make_attribute(attr_name, attr_value)) + flag = True + + return flag + + +''' +Graph Input/Output +''' + + +def add_extra_output(graph, target_output, target_shape): + + extra_elem_type = 1 + for vi in graph.value_info: + if vi.name == target_output: + extra_elem_type = vi.type.tensor_type.elem_type + + extra_output = helper.make_tensor_value_info(target_output, + extra_elem_type, target_shape) + ''' + # NOTE + # if we know the value type and shape, we can alse use this + def make_tensor_value_info( + name, # type: Text + elem_type, # type: int + shape, # type: Optional[Sequence[Union[Text, int]]] + doc_string="", # type: Text + shape_denotation=None, # type: Optional[List[Text]] + ): + ''' + + graph.output.append(extra_output) + return + + +def get_graph_input_by_name(graph, name): + for input in graph.input: + if input.name == name: + return input + return None + + +def get_graph_output_by_name(graph, name): + for out in graph.output: + if out.name == name: + return out + return None + + +def resort_nodes(model): + new_model = copy.deepcopy(model) + for n in new_model.graph.node: + model.graph.node.remove(n) + + ready_tensors = [n.name for n in model.graph.input] + ready_tensors.extend([n.name for n in model.graph.initializer]) + ready_tensors = set(ready_tensors) + all_nodes = [n for n in new_model.graph.node] + while True: + activate_nodes = [] + for node in all_nodes: + inputs = set(node.input) + if len(inputs - ready_tensors) == 0: + activate_nodes.append(node) + + assert len(activate_nodes) != 0, 'invalid graph' + for node in activate_nodes: + model.graph.node.append(node) + ready_tensors = ready_tensors | set(node.output) + all_nodes.remove(node) + + if len(all_nodes) == 0: + break + return model + + +''' +Pass +''' + + +def fix_model_shape(model, + in_dim_dict=None, + out_dim_dict=None, + fully_si=False): + + if in_dim_dict != None and out_dim_dict != None: + update_model_dims.update_inputs_outputs_dims(model, in_dim_dict, + out_dim_dict) + + if fully_si: + input_num = len(model.graph.input) + tensors = model.graph.initializer + for i, tensor in enumerate(tensors): + value_info = helper.make_tensor_value_info( + tensor.name, ONNX_DTYPE[tensor.data_type], tensor.dims) + model.graph.input.insert(i + input_num, value_info) + + onnx.checker.check_model(model) + model = shape_inference.infer_shapes(model) + + return model + + +def remove_redundant_cast(graph): + cast_nodes = get_nodes_by_optype(graph, "Cast") + for node in cast_nodes: + in_node = get_node_by_output_name(graph, node.input[0]) + if in_node.op_type == "Cast": + print("Removing redundant cast: ", in_node) + node.input[0] = in_node.input[0] + graph.node.remove(in_node) + + +def onxx_sess_opt(model, opt_model): + sess_options = rt.SessionOptions() + sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_BASIC + sess_options.optimized_model_filepath = opt_model + rt.InferenceSession(model, + sess_options, + providers=['CPUExecutionProvider']) + + +# ------------- Model speficted pass -------------------- + + +def convert_fp16_to_fp32(model): + # handle model.graph.initializer + to_convert = [] + for init in model.graph.initializer: + # 
print(init.name) + + if init.data_type != 10: + continue + to_convert.append(init) + + for init in to_convert: + val = get_init_value(init) + new_val = val.astype(np.float32) + new_init = numpy_helper.from_array(new_val, init.name) + model.graph.initializer.remove(init) + model.graph.initializer.append(new_init) + + # handle mode.graph.node + cons_ops = get_nodes_by_optype(model.graph, "Constant") + for op in cons_ops: + val_attr = get_attribute_by_name(op, "value") + if val_attr.t.data_type != 10: + continue + + # import pdb;pdb.set_trace() + val = get_constant_value(op) + new_val = val.astype(np.float32) + set_constant_value(op, new_val) + val_attr.t.data_type = 1 + + for val_info in model.graph.value_info: + if val_info.type.tensor_type.elem_type != 10: + continue + val_info.type.tensor_type.elem_type = 1 + + # handle cast op + cast_ops = get_nodes_by_optype(model.graph, "Cast") + + to_remove = [] + for cast in cast_ops: + to = get_attribute_by_name(cast, "to") + if to.i != 10 and to.i != 1: + continue + + if to.i == 10: + up_node = get_node_by_output_name(model.graph, cast.input[0]) + set_node_attribute(cast, "to", 1) + + if up_node.op_type != "Cast": + continue + + up_to = get_attribute_by_name(up_node, "to") + if up_to.i != 1: + continue + + if to.i == 1: + down_node = get_node_successor(model.graph, cast) + if len(down_node) == 0: + continue + + if down_node[0].op_type != "Cast": + continue + + down_to = get_attribute_by_name(down_node[0], "to") + if down_to.i != 10: + continue + + # print(cast.name) + succs = get_node_successor(model.graph, cast) + for succ in succs: + for idx, in_name in enumerate(succ.input): + if in_name == cast.output[0]: + succ.input[idx] = cast.input[0] + + to_remove.append(cast) + + for cast in to_remove: + out_info = get_graph_output_by_name(model.graph, cast.output[0]) + if out_info == None: + model.graph.node.remove(cast) + else: + node = get_node_by_output_name(model.graph, cast.input[0]) + if node != None: + for idx, out in enumerate(node.output): + if out == cast.input[0]: + node.output[idx] = cast.output[0] + + model.graph.node.remove(cast) + + return model + + +def replace_mask_where(model): + # pattern: sub -> cast ----| + # |-----------> where + where_ops = get_nodes_by_optype(model.graph, "Where") + + to_replace = [] + for where_node in where_ops: + cond = where_node.input[0] + node = get_node_by_output_name(model.graph, cond) + if node.op_type != "Cast": + continue + + y_in = where_node.input[2] + node = get_node_by_output_name(model.graph, y_in) + if node.op_type != "Sub": + continue + + to_replace.append(where_node) + + to_remove = [] + for where in to_replace: + x_in = where.input[1] + y_in = where.input[2] + mul_op = onnx.helper.make_node('Mul', [x_in, y_in], + where.output, + name="{}_mask_mul_replaced".format( + where.name)) + model.graph.node.append(mul_op) + + cast_op = get_node_by_output_name(model.graph, where.input[0]) + to_remove.append(cast_op) + to_remove.append(where) + + for node in to_remove: + model.graph.node.remove(node) + + return model + + +def convert_expand_to_tile(model): + expand_ops = get_nodes_by_optype(model.graph, "Expand") + + for expand_node in expand_ops: + ifm = expand_node.input[0] + ofm = expand_node.output[0] + + ifm_vi = get_value_info_by_name(model.graph, expand_node.input[0]) + if ifm_vi == None: + continue + + init_shape = get_initializer_by_name(model.graph, expand_node.input[1]) + if init_shape == None: + continue + shape_val = get_init_value(init_shape) + + ofm_shape = shape_val.tolist() + ifm_shape = [ 
+ dim.dim_value for dim in ifm_vi.type.tensor_type.shape.dim + ] + + repeats = [ + 1 if i == j else int(j / i) for i, j in zip(ifm_shape, ofm_shape) + ] + + repeats = np.array(repeats) + repeats = numpy_helper.from_array( + repeats, 'Tile_{}_repeats'.format(expand_node.name)) + tile_node = onnx.helper.make_node('Tile', [ifm, repeats.name], [ofm], + name=expand_node.name) + + model.graph.node.append(tile_node) + model.graph.initializer.append(repeats) + model.graph.node.remove(expand_node) + + return model + + +def concat_to_tile(model): + def is_tile_type(node): + tile_flag = True + for idx in range(len(node.input) - 1): + if node.input[idx] == node.input[idx + 1]: + continue + else: + tile_flag = False + break + return tile_flag + + concat_ops = get_nodes_by_optype(model.graph, "Concat") + + for concat in concat_ops: + if not is_tile_type(concat): + continue + + print("Converting concat to tile") + + in_val = get_value_info_by_name(model.graph, concat.input[0]) + out_val = get_value_info_by_name(model.graph, concat.output[0]) + ifm_shape = get_shape_from_value_info(in_val) + ofm_shape = get_shape_from_value_info(out_val) + + repeats = [ + 1 if i == j else int(j / i) for i, j in zip(ifm_shape, ofm_shape) + ] + + repeats = np.array(repeats) + repeats = numpy_helper.from_array( + repeats, 'Tile_{}_repeats'.format(concat.name)) + tile_node = onnx.helper.make_node('Tile', + [concat.input[0], repeats.name], + [concat.output[0]], + name=concat.name) + + model.graph.node.append(tile_node) + model.graph.initializer.append(repeats) + model.graph.node.remove(concat) + + +def remove_qdq(model): + q_ops = get_nodes_by_optype(model.graph, "QuantizeLinear") + + for q_op in q_ops: + dq = get_node_successor(model.graph, q_op) + if len(dq) != 1 and dq[0].op_type != "DequantizeLinear": + continue + + qdq_succ = get_node_successor(model.graph, dq[0]) + for i, n in enumerate(qdq_succ[0].input): + if n == dq[0].output[0]: + qdq_succ[0].input[i] = q_op.input[0] + + model.graph.node.remove(q_op) + model.graph.node.remove(dq[0]) + + +import torch +from onnx2torch import convert +import onnxruntime as ort + +if __name__ == "__main__": + # Path to ONNX model + onnx_model_path = 'converted_models/no_qdq_2.onnx' + onnx_model = onnx.load(onnx_model_path) + in_shape_dict = { + "data": [2, 10, 3, 256, 256], + } + out_shape_dict = {'logits': [2, 2], '1383': [1, 20]} + onnx_model = fix_model_shape(onnx_model, in_shape_dict, out_shape_dict, + True) + onnx.save(onnx_model, 'converted_models/no_qdq_3.onnx') + + onxx_sess_opt('converted_models/no_qdq_3.onnx', + 'converted_models/no_qdq_3.onnx') + onnx_model = onnx.load('converted_models/no_qdq_3.onnx') + + torch_model_2 = convert(onnx_model) + + # You can pass the path to the onnx model to convert it or... 
+    # torch_model_1 = convert(onnx_model_path)
+
+    # Create example data
+    x = torch.ones((2, 10, 3, 256, 256))
+
+    out_torch = torch_model_2(x)
+
+    trace_model = torch.jit.trace(torch_model_2, x)
+
+    ort_sess = ort.InferenceSession(onnx_model_path)
+    outputs_ort = ort_sess.run(None, {'data': x.numpy()})
+
+    print(outputs_ort[0] - out_torch[0].detach().numpy())
+    print(outputs_ort[1] - out_torch[1].detach().numpy())
+
+    # Check the ONNX output against PyTorch
+    # print(torch.max(torch.abs(outputs_ort[0] - out_torch[0].detach().numpy())))
+    # print(torch.max(torch.abs(outputs_ort[1] - out_torch[1].detach().numpy())))
+    # print(np.allclose(outputs_ort[0], out_torch[0].detach().numpy(), atol=1.e-7))
+    # print(np.allclose(outputs_ort[1], out_torch[1].detach().numpy(), atol=1.e-7))
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/requirements.txt b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f59eea6505a4b43ae61fc6c09d66dc36aa7cafc7
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/requirements.txt
@@ -0,0 +1,4 @@
+tensorflow>=2.6.0
+tf2onnx
+numpy
+torch==1.9.1
\ No newline at end of file
diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_frozen.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_frozen.py
new file mode 100644
index 0000000000000000000000000000000000000000..0da7fbf3ea73f2306aab1b86ee115c1da6c90ee8
--- /dev/null
+++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_frozen.py
@@ -0,0 +1,147 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+An interface to export saved_models to frozen models.
+
+Please note that this API makes the following assumptions:
+
+    1. the saved_model directory is laid out like below:
+        |--save-model.pb
+        |--variable
+        |--    |--variables.data-00000-of-00001
+        |--    |--variables.index
+
+    2. saved_tags is tag_constants.SERVING by default if not specified
+    3.
signature is signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY by default if not specific +Copyright Reserve: Habana Labs +''' + +import sys +from tensorflow.python.tools import freeze_graph +from tensorflow.python.tools import saved_model_cli +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.saved_model import signature_constants +import argparse +from six import StringIO +import contextlib + + +def freeze_saved_model(saved_model_dir, + output_nodes, + pb_name, + saved_tags=tag_constants.SERVING): + input_saved_model_dir = saved_model_dir + output_node_names = output_nodes + input_binary = False + input_saver_def_path = False + restore_op_name = None + filename_tensor_name = None + clear_devices = True + input_meta_graph = False + checkpoint_path = None + input_graph_filename = None + saved_model_tags = saved_tags + output_graph_filename = pb_name + + freeze_graph.freeze_graph(input_graph_filename, input_saver_def_path, + input_binary, checkpoint_path, output_node_names, + restore_op_name, filename_tensor_name, + output_graph_filename, clear_devices, "", "", "", + input_meta_graph, input_saved_model_dir, + saved_model_tags) + + +@contextlib.contextmanager +def captured_output(): + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +def get_output_node(saved_model_dir, saved_tags, sign): + + parser = saved_model_cli.create_parser() + args = parser.parse_args([ + 'show', '--dir', saved_model_dir, '--tag_set', saved_tags, + '--signature_def', sign + ]) + + with captured_output() as (out, err): + saved_model_cli.show(args) + + result = out.getvalue().strip() + + print(result) + + output_num = 0 + output_nodes = None + lines = result.split('\n') + for idx, line in enumerate(result.split('\n')): + if "outputs[" in line: + line = lines[idx + 3] + output = line.split(":")[1] + if output_num > 0: + output_nodes = output_nodes + "," + output + else: + output_nodes = output + output_num = output_num + 1 + + if output_nodes == None: + raise RuntimeError("No Output Nodes found in saved_model.") + + return output_nodes, output_num + + +def saved_to_frozen( + saved_model_dir, + frozen_path, + saved_tags=tag_constants.SERVING, + sign=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY): + + output_nodes, output_num = get_output_node(saved_model_dir, saved_tags, + sign) + + output_nodes = output_nodes + + print("[INFO]: Save Model has [", output_num, "] outputs.") + print("[INFO]: Outputs Nodes: [", output_nodes, "].") + + # cwd = os.getcwd() + # frozen_path = os.path.join(cwd, "converted_frozen.pb") + + freeze_saved_model(saved_model_dir, output_nodes, frozen_path, saved_tags) + + print("[INFO]: Saved Model convert to Frozen Model done.") + print("[INFO]: Frozen Model saved here: ", frozen_path) + + return frozen_path + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", default="") + parser.add_argument("--output_path", default="") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() + saved_to_frozen(args.model_path, args.output_path) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_onnx.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_onnx.py new file mode 100644 index 
0000000000000000000000000000000000000000..4b6a74560e8ce6b733f0c78d6e8984736c84ee60 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/saved_to_onnx.py @@ -0,0 +1,73 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tf2onnx +from tf2onnx import tf_loader +import argparse +ONNX_OPSET = 11 + + +def _convert_graphdef_to_onnx(graph_def, + inputs=None, + outputs=None, + output_path='', + **kwargs): + inputs_as_nchw = kwargs.get('inputs_as_nchw', None) + custom_ops = kwargs.get('custom_ops', None) + custom_op_handlers = kwargs.get('custom_op_handlers', None) + custom_rewriter = kwargs.get('custom_rewriter', None) + extra_opset = kwargs.get('extra_opset', None) + large_model = kwargs.get('large_model', False) + name = kwargs.get('name', 'habana_convert') + target = kwargs.get('target', None) + shape_override = kwargs.get('shape_override', {}) + + tf2onnx.convert.from_graph_def(graph_def, + name=name, + input_names=inputs, + output_names=outputs, + opset=ONNX_OPSET, + custom_ops=custom_ops, + custom_op_handlers=custom_op_handlers, + custom_rewriter=custom_rewriter, + inputs_as_nchw=inputs_as_nchw, + extra_opset=extra_opset, + shape_override=shape_override, + target=target, + large_model=large_model, + output_path=output_path) + return output_path + + +def savedmodel_to_onnx(model_path, output_path='', **kwargs): + inputs = kwargs.get('inputs', None) + outputs = kwargs.get('outputs', None) + graph_def, inputs, outputs = tf_loader.from_saved_model( + model_path, inputs, outputs) + return _convert_graphdef_to_onnx(graph_def, inputs, outputs, output_path, + **kwargs) + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", default="") + parser.add_argument("--output_path", default="") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() + savedmodel_to_onnx(args.model_path, args.output_path) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_fp32_to_fp16.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_fp32_to_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f6dad9687d751c55a0e5c5f9a0a73997d70fff --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_fp32_to_fp16.py @@ -0,0 +1,189 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
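+# Usage note (added for clarity): this script rewrites a frozen TensorFlow graph to a
+# different float precision (fp16/fp32/fp64), keeping FusedBatchNorm and any nodes
+# listed in keep_fp32_node_name in float32. main() below hard-codes the model path and
+# the input/output node names for one specific graph, so adjust them (and target_type)
+# first; it also imports tf_load_op_library from an internal textops package, which
+# must be importable before running:
+#
+#     python3 tf_fp32_to_fp16.py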
+ +import tensorflow as tf +# tf.contrib.resampler +from tensorflow.core.framework import types_pb2, graph_pb2, attr_value_pb2 +from tensorflow.tools.graph_transforms import TransformGraph +from google.protobuf import text_format +import numpy as np +from textops import tf_load_op_library + +# Const should be float32 in object detection api during nms (see here: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/non-max-suppression-v4.html) +keep_fp32_node_name = [] +keep_fp16_node_name = [] + + +def load_graph(model_path): + graph = tf.Graph() + with graph.as_default(): + graph_def = tf.GraphDef() + if model_path.endswith("pb"): + with open(model_path, "rb") as f: + graph_def.ParseFromString(f.read()) + else: + with open(model_path, "r") as pf: + text_format.Parse(pf.read(), graph_def) + tf.import_graph_def(graph_def, name="") + sess = tf.Session(graph=graph) + return sess + + +def rewrite_batch_norm_node_v2(node, graph_def, target_type='fp16'): + """ + Rewrite FusedBatchNorm with FusedBatchNormV2 for reserve_space_1 and reserve_space_2 in FusedBatchNorm require float32 for + gradient calculation (See here: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/fused-batch-norm) + """ + if target_type == 'fp16': + dtype = types_pb2.DT_HALF + elif target_type == 'fp64': + dtype = types_pb2.DT_DOUBLE + else: + dtype = types_pb2.DT_FLOAT + new_node = graph_def.node.add() + new_node.op = "FusedBatchNormV2" + new_node.name = node.name + new_node.input.extend(node.input) + new_node.attr["U"].CopyFrom( + attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT)) + for attr in list(node.attr.keys()): + if attr == "T": + node.attr[attr].type = dtype + new_node.attr[attr].CopyFrom(node.attr[attr]) + print("rewrite fused_batch_norm done!") + + +def convert_graph_to_fp16(model_path, + save_path, + name, + as_text=False, + target_type='fp16', + input_name=None, + output_names=None): + if target_type == 'fp16': + dtype = types_pb2.DT_HALF + elif target_type == 'fp64': + dtype = types_pb2.DT_DOUBLE + else: + dtype = types_pb2.DT_FLOAT + + source_sess = load_graph(model_path) + source_graph_def = source_sess.graph.as_graph_def() + target_graph_def = graph_pb2.GraphDef() + target_graph_def.versions.CopyFrom(source_graph_def.versions) + + for node in source_graph_def.node: + # fused batch norm node + if node.op == "FusedBatchNorm": + rewrite_batch_norm_node_v2(node, + target_graph_def, + target_type=target_type) + continue + + # replicate node + new_node = target_graph_def.node.add() + new_node.op = node.op + new_node.name = node.name + new_node.input.extend(node.input) + attrs = list(node.attr.keys()) + + # keep batch norm params node + if ("BatchNorm" in node.name) or ('batch_normalization' in node.name): + for attr in attrs: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + # replace dtype in node attr with target dtype + for attr in attrs: + # keep special node in fp32 + if node.name in keep_fp32_node_name: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + if node.attr[attr].type == types_pb2.DT_FLOAT: + # modify node dtype + node.attr[attr].type = dtype + + if attr == "value": + tensor = node.attr[attr].tensor + if tensor.dtype == types_pb2.DT_FLOAT: + # if float_val exists + if tensor.float_val: + float_val = tf.make_ndarray(node.attr[attr].tensor) + new_node.attr[attr].tensor.CopyFrom( + tf.make_tensor_proto(float_val, dtype=dtype)) + continue + + # if tensor content exists + if tensor.tensor_content: + tensor_shape = [ + x.size for x in tensor.tensor_shape.dim + ] + 
tensor_weights = tf.make_ndarray(tensor) + # reshape tensor + tensor_weights = np.reshape(tensor_weights, + tensor_shape) + tensor_proto = tf.make_tensor_proto(tensor_weights, + dtype=dtype) + new_node.attr[attr].tensor.CopyFrom(tensor_proto) + continue + + new_node.attr[attr].CopyFrom(node.attr[attr]) + + # transform graph + if output_names: + if not input_name: + input_name = [] + transforms = ["strip_unused_nodes"] + target_graph_def = TransformGraph(target_graph_def, input_name, + output_names, transforms) + + # write graph_def to model + tf.io.write_graph(target_graph_def, + logdir=save_path, + name=name, + as_text=as_text) + print("Converting done ...") + + +def main(): + # input_name = ["input_ids", "segment_ids", "input_mask"] + # output_names = ["output_scores"] + input_name = [ + "block_ids", "font_size", "height", "strclass", "tag_titles", "tags", + "text", "urls", "width", "x_axis", "y_axis" + ] + output_names = ["loss/Softmax", "init_all_tables"] + + model_path = "frozen_init_all_table.pb" + save_path = "./" + name = "fp32_frozen_init_all_table.pb" + as_text = False + target_type = 'fp32' + convert_graph_to_fp16(model_path, + save_path, + name, + as_text=as_text, + target_type=target_type, + input_name=input_name, + output_names=output_names) + # test loading + # ISSUE: loading detection model is extremely slow while loading classification model is normal + sess = load_graph(save_path + "/" + name) + print("DONE!") + + +if __name__ == "__main__": + tf_load_op_library() + main() diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_utils.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4e197d4aeec3ef820283500b676e6af5f6399ad7 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/tf_utils.py @@ -0,0 +1,861 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +import tensorflow as tf +from tensorflow.core import framework +from tensorflow.core.framework import types_pb2, graph_pb2, attr_value_pb2 +from tensorflow.tools.graph_transforms import TransformGraph +from google.protobuf import text_format +import numpy as np + + +def isTextProtobuf(filename): + """ Returns whether a filename is a text protobuf based on the file extension. + + Args: + filename: string - file name to process. + + Returns: + true if `filename`'s extension is .pbtxt, false otherwise. + """ + + retval = False + + _, filename_ext = os.path.splitext(filename) + if filename_ext and filename_ext.lower() == ".pbtxt": + retval = True + + return retval + + +def saveGraphProtobufToFile(file_name, graph_d): + """ Saves a `GraphDef` protocol buffer graph to a file. + + Args: + file_name: string - name of the file where to write the graph. + graph_d: The `GraphDef` protocol buffer to save. 
+ """ + output_file_name_no_dir = os.path.basename(file_name) + output_file_dir = os.path.dirname(file_name) + tf.io.write_graph(graph_d, + output_file_dir, + output_file_name_no_dir, + as_text=isTextProtobuf(file_name)) + + +def loadGraphProtobufFromFile(file_name): + """ Loads a `GraphDef` protocol buffer graph from a file. + + Args: + file_name: string - name of the file to load. + + Returns: + A `GraphDef` protocol buffer loaded from the file. + """ + graph_d = framework.graph_pb2.GraphDef() + with open(file_name, "rb") as f: + if isTextProtobuf(file_name): + # for text file: + text_format.Merge(f.read(), graph_d) + else: + # for binary file: + graph_d.ParseFromString(f.read()) + return graph_d + + +def duplicateGraph(graph_d): + """ Creates a deep copy of a tf GraphDef. + + Args: + graph_d: A `GraphDef` protocol buffer to duplicate. + + Returns: + A deep copy of the specified tf GraphDef. + """ + + with tf.Graph().as_default() as tmp_graph: + _ = tf.import_graph_def(graph_d, name="") + return tmp_graph.as_graph_def() + + +def getNodeNames(nodes_d): + """ Compiles a list of strings representing all the name of + the nodes in the specified list of nodes. + + Args: + nodes_d: List of `NodeDef` objects to process. + + Returns: + A list of strings representing all the name of the nodes in `nodes_d`. + """ + return [node_d.name for node_d in nodes_d] + + +def getNodeIndexByName(nodes_d, node_name): + """ Finds the NodeDef node in list of NodeDef corresponding to + the specified name. + + Args: + nodes_d: List of `NodeDef` objects to process. + node_name: node to find. + + Returns: + And integer index representing the index of the node in the list + passed or -1 if not found. + """ + + retval = -1 + for i, node_d in enumerate(nodes_d): + if node_d.name == node_name: + retval = i + break + return retval + + +def getNodeInputNamesClean(node_input_names): + retval = [] + for input_name in node_input_names: + tensor_idx = input_name.rfind(":") + if tensor_idx < 0: + retval.append(input_name) + else: + retval.append(input_name[:tensor_idx]) + return retval + + +def getNodeByName(nodes_d, node_name): + """ Finds the NodeDef node in list of NodeDef corresponding to + the specified name. + + Args: + nodes_d: List of `NodeDef` objects to process. + node_name: node to find. + + Returns: + The `NodeDef` node in `nodes_d` corresponding to the specified name, + or None if name is not found in `nodes_d`. + """ + + retval = getNodeIndexByName(nodes_d, node_name) + if (retval < 0): + retval = None + else: + retval = nodes_d[retval] + return retval + + +def getInputNodeNames(graph_d): + """ Finds the placeholder nodes (or inputs) in the graph. + + Args: + graph_d: A `GraphDef` protocol buffer to process. + + Returns: + A list of node names corresponding to all nodes that are + inputs to the graph. + """ + + retval = [] + for node_d in graph_d.node: + if node_d.op == "Placeholder": + retval.append(node_d.name) + return retval + + +def getOutputNodeNames(graph_d): + """ Finds the nodes that are leaf nodes (or outputs) in the graph. + + Args: + graph_d: A `GraphDef` protocol buffer to process. + + Returns: + A list of node names corresponding to all nodes that are + leaf nodes (or outputs) in the graph. 
+ """ + + non_output_node_names = set() + for node_d in graph_d.node: + non_output_node_names = non_output_node_names | set( + getNodeInputNamesClean(node_d.input)) + graph_node_names = set(getNodeNames(graph_d.node)) + return list(graph_node_names - non_output_node_names) + + +def getNodesInOutput(graph_d, node_name): + """ Finds all nodes that use the output of specified node as + their input in the specified graph. + + Args: + graph_d: A `GraphDef` protocol buffer to process. + node_name: String name of node to check. + + Returns: + A list of node names corresponding to all nodes that use the + output of specified node as their input. + """ + retval = [] + + for node_d in graph_d.node: + node_input_names = getNodeInputNamesClean(node_d.input) + for id, input_name in enumerate(node_input_names): + if input_name == node_name: + retval.append([id, node_d.name]) + break + + return retval + + +def getNodesInSubGraph(graph_d, start_nodes, end_nodes): + subgraph = [] + for node in start_nodes: + subgraph.append(node) + + successor = start_nodes + while len(successor) != 0: + for node in successor: + tmp_suc = getNodesInOutput(graph_d, node) + for suc in tmp_suc: + if suc in subgraph: + continue + else: + subgraph.append(suc) + successor = tmp_suc + + return subgraph + + +def convertTensorflow2NumpyShape(shape_tf): + """ Converts a tensorflow `TensorShape` to a numpy shape. + All unknown values for partial shapes will be converted to -1. + + Args: + shape_tf: A `TensorShape` object to convert. + + Returns: + A list of values representing a valid numpy style shape. + """ + retval = [ + shape_val if shape_val is not None else -1 + for shape_val in shape_tf.as_list() + ] + return retval + + +def convertNumpy2TensorflowShape(shape_np): + """ Converts a numpy shape to a tensorflow shape. + All unknown (-1) values for partial shapes will be converted to None. + + Args: + shape_np: A list of values representing a valid numpy shape. + + Returns: + A list of values representing a valid tensorflow style shape. + """ + retval = [shape_val if shape_val >= 0 else None for shape_val in shape_np] + return retval + + +def getInputShape(graph_d, numpy_format=False): + """ Retrieves the shape of all inputs to specified `GraphDef` object. + + Args: + graph_d: A `GraphDef` protocol buffer to process. + numpy_format: boolean - if False (default), shape is given in tensorflow format, + otherwise, numpy format. + + Returns: + A mapping string => list: from input tensor name to shape. + """ + + retval = {} + + input_node_names = getInputNodeNames(graph_d) + + tf.import_graph_def(graph_d, name="") + for input_node_name in input_node_names: + # find all output tensors for this placeholder, i.e. input:0, input:1, etc. + try: + i = 0 + while True: + input_tensor_name = input_node_name + ":" + str(i) + next_input_tensor = tf.get_default_graph().get_tensor_by_name( + input_tensor_name) + tensor_shape = next_input_tensor.shape + if numpy_format: + tensor_shape = convertTensorflow2NumpyShape(tensor_shape) + retval[input_tensor_name] = tensor_shape + i += 1 + except: + pass # reached the end of the placeholder outputs + + return retval + + +def getInputOutputNodes(frozen_graph): + """ Finds all input and output nodes in the specified graph. + + Args: + frozen_graph: TensorFlow frozen graph + + Returns: + A list of input and output node names. 
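+        Inputs are matched against the predefined names ('segment', 'mask', 'input_ids');
+        a RuntimeError is raised when no suitable inputs can be found.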
+ """ + predefined_inputs = ['segment', 'mask', 'input_ids'] + graph_d = loadGraphProtobufFromFile(frozen_graph) + inputs = getInputNodeNames(graph_d) + outputs = getOutputNodeNames(graph_d) + nodes = [ + str for str in inputs if any(sub in str for sub in predefined_inputs) + ] + if len(nodes) == len(predefined_inputs): + return [inputs, outputs] + else: + status, inputs = findNodeByName(graph_d, predefined_inputs) + if status: + return [inputs, outputs] + else: + raise RuntimeError( + "Cannot find suitable inputs for this tool, please indicate the names of inputs after preprocessing" + ) + + +def findNodeByName(graph_d, node_name): + """ Finds nodes specified by name in the specified graph. + + Args: + graph_d: A `GraphDef` protocol buffer to process. + node_name: String name of node to check. + + Returns: + status - True if all nodes are found, False otherwise + A list of node names. + """ + status = False + all_nodes = list(getNodeNames(graph_d.node)) + retval = [str for str in all_nodes if any(sub in str for sub in node_name)] + if len(node_name) == len(retval): + status = True + + return status, retval + + +def load_graph(model_path): + graph = tf.Graph() + with graph.as_default(): + graph_def = tf.GraphDef() + if model_path.endswith("pb"): + with open(model_path, "rb") as f: + graph_def.ParseFromString(f.read()) + else: + with open(model_path, "r") as pf: + text_format.Parse(pf.read(), graph_def) + return graph_def + + +from opt_tf import * +import os +import tensorflow as tf +import sys +from tensorflow.python.tools import freeze_graph +from tensorflow.python.tools import saved_model_cli +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.saved_model import signature_constants +from tensorflow.tools.graph_transforms import TransformGraph +from six import StringIO, iteritems +import contextlib + +from tensorflow.core.framework import types_pb2, tensor_shape_pb2, graph_pb2, attr_value_pb2 + +import numpy as np +from load_runstep import load_runstep + + +def load_graph(model): + graph_def = tf.GraphDef() + + print("load model: ", model) + with open(model, 'rb') as f: + graph_def.ParseFromString(f.read()) + + return graph_def + + +def find_node(graph_def, name): + node = None + for n in graph_def.node: + if n.name == name: + node = n + break + # if node == None: + # print('Node {} not found'.format(name)) + + return node + + +def find_node_by_type(graph_def, type): + node = [] + for n in graph_def.node: + if n.op == type: + node.append(n) + return node + + +def get_node_successor(graph_def, node_name): + outputs = [] + for n in graph_def.node: + for input in n.input: + if node_name == input.split(':')[0]: + outputs.append(n) + + # if len(outputs) == 0: + # print("[INFO] {} has no successor".format(node_name)) + + return outputs + + +def get_node_output(graph_def, node_name): + outputs = [] + for n in graph_def.node: + for input in n.input: + if node_name == input.split(':')[0]: + if len(input.split(':')) == 1: + if not input + ":0" in outputs: + outputs.append(input + ":0") + else: + if not input in outputs: + outputs.append(input) + + # if len(outputs) == 0: + # print("[INFO] {} has no output".format(node_name)) + + return outputs + + +# single in & singel out + + +def remove_nodes(graph_d, nodes): + for node in nodes: + # assert len(node.input) == 1 + pre_node = node.input[0] + + succ_nodes = get_node_successor(graph_d, node.name) + for succ in succ_nodes: + for idx, name in enumerate(succ.input): + if name == node.name: + succ.input[idx] = pre_node + + 
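+        # every successor now reads from the predecessor, so the bypassed node can be dropped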
graph_d.node.remove(node) + + return graph_d + + +def create_shape_proto(shape): + shape_proto = tensor_shape_pb2.TensorShapeProto() + for dim in shape: + shape_proto.dim.add().size = dim + return attr_value_pb2.AttrValue(shape=shape_proto) + + +def set_shape(node, shape): + node.attr["shape"].CopyFrom(create_shape_proto(shape)) + + +def remove_control_dep(graph_def): + # reset & import + tf.reset_default_graph() + tf.import_graph_def(graph_def, name="") + + for node in graph_def.node: + op = tf.get_default_graph().get_operation_by_name(node.name) + if len(op.control_inputs) != 0: + tf.contrib.graph_editor.remove_control_inputs( + op, op.control_inputs) + + graph_def = tf.get_default_graph().as_graph_def() + return graph_def + + +def is_leaf_node(graph_d, name): + for n in graph_d.node: + for in_n in n.input: + if name == in_n or name == in_n.split(":0")[0]: + return False + return True + + +def get_node_shape(node): + return [d.size for d in node.attr["shape"].shape.dim] + + +def get_graph_input(graph_d): + in_node = [] + for n in graph_d.node: + if n.op == "Placeholder": + in_node.append(n.name) + + to_remove = [] + for in_n in in_node: + if is_leaf_node(graph_d, in_n): + to_remove.append(in_n) + + for name in to_remove: + node = find_node(graph_d, name) + graph_d.node.remove(node) + + real_in = set(in_node) - set(to_remove) + + return list(real_in) + + +def get_graph_output(graph_d): + out_node = [] + for n in graph_d.node: + if len(get_node_successor(graph_d, n.name)) == 0: + out_node.append(n.name) + + # if len(out_node) == 0: + # print("[INFO] Graph No Outputs??") + + return out_node + + +def get_constant_val(node): + val = tf.make_ndarray(node.attr["value"].tensor) + return val + + +def get_dtype_from_np(val): + if val.dtype == np.int32: + return types_pb2.DT_INT32 + + if val.dtype == np.float32: + return types_pb2.DT_FLOAT + + if val.dtype == np.int64: + return types_pb2.DT_INT64 + + if val.dtype == np.float16: + return types_pb2.DT_HALF + + raise ValueError("DTYPE {} NOT SUPPORTEED!".format(val.dtype)) + + +def set_constant_val(node, val): + tf_dtype = get_dtype_from_np(val) + node.attr["value"].tensor.CopyFrom( + tf.make_tensor_proto(val, dtype=tf_dtype)) + + +@contextlib.contextmanager +def captured_output(): + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr + finally: + sys.stdout, sys.stderr = old_out, old_err + + +def get_saved_input_node(saved_model_dir, saved_tags, sign): + + parser = saved_model_cli.create_parser() + args = parser.parse_args([ + 'show', '--dir', saved_model_dir, '--tag_set', saved_tags, + '--signature_def', sign + ]) + + with captured_output() as (out, err): + saved_model_cli.show(args) + + result = out.getvalue().strip() + + input_tensors = [] + + lines = result.split('\n') + for idx, line in enumerate(result.split('\n')): + if "inputs[" in line: + line = lines[idx + 3] + input = line.split(":")[1] + input_tensors.append(input.strip() + ":0") + return input_tensors + + +def get_saved_output_node(saved_model_dir, saved_tags, sign): + + parser = saved_model_cli.create_parser() + args = parser.parse_args([ + 'show', '--dir', saved_model_dir, '--tag_set', saved_tags, + '--signature_def', sign + ]) + + with captured_output() as (out, err): + saved_model_cli.show(args) + + result = out.getvalue().strip() + + # print(result) + + output_nodes = [] + lines = result.split('\n') + for idx, line in enumerate(result.split('\n')): + if "outputs[" in 
line: + line = lines[idx + 3] + output = line.split(":")[1] + output_nodes.append(output.strip() + ":0") + + return output_nodes + + +def duplicate_const(graph_d): + all_consts = find_node_by_type(graph_d, "Const") + + need_duplicate = [] + for node in all_consts: + if len(get_node_successor(graph_d, node.name)) > 1: + need_duplicate.append(node.name) + + for node in need_duplicate: + succ_nodes = get_node_successor(graph_d, node) + + for idx, succ in enumerate(succ_nodes): + ori_node = find_node(graph_d, node) + + new_node = graph_d.node.add() + new_node.op = ori_node.op + new_node.name = ori_node.name + "new_{}".format(idx) + new_node.input.extend(ori_node.input) + attrs = list(ori_node.attr.keys()) + for attr in attrs: + new_node.attr[attr].CopyFrom(ori_node.attr[attr]) + + for i, input in enumerate(succ.input): + if input == ori_node.name: + succ.input[i] = new_node.name + + return graph_d + + +def rewrite_batch_norm_node_v2(node, graph_def, target_type): + """ + Rewrite FusedBatchNorm with FusedBatchNormV2 for reserve_space_1 and reserve_space_2 in FusedBatchNorm require float32 for + gradient calculation (See here: https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/fused-batch-norm) + """ + if target_type == 'fp16': + dtype = types_pb2.DT_HALF + elif target_type == 'fp32': + dtype = types_pb2.DT_FLOAT + + new_node = graph_def.node.add() + new_node.op = "FusedBatchNormV2" + new_node.name = node.name + new_node.input.extend(node.input) + new_node.attr["U"].CopyFrom( + attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT)) + for attr in list(node.attr.keys()): + if attr == "T": + node.attr[attr].type = dtype + new_node.attr[attr].CopyFrom(node.attr[attr]) + + print("rewrite fused_batch_norm done!") + + +def convert_graph_to_fp16(model_path, + save_path, + name, + as_text=False, + target_type='fp16', + input_name=None, + output_names=None, + keep_fp32_node_name=[]): + if target_type == 'fp16': + dtype = types_pb2.DT_HALF + elif target_type == 'fp32': + dtype = types_pb2.DT_FLOAT + + source_sess = load_graph(model_path) + source_graph_def = source_sess.graph.as_graph_def() + target_graph_def = graph_pb2.GraphDef() + target_graph_def.versions.CopyFrom(source_graph_def.versions) + + for node in source_graph_def.node: + # fused batch norm node + if node.op == "FusedBatchNorm": + rewrite_batch_norm_node_v2(node, + target_graph_def, + target_type=target_type) + continue + + # replicate node + new_node = target_graph_def.node.add() + new_node.op = node.op + new_node.name = node.name + new_node.input.extend(node.input) + attrs = list(node.attr.keys()) + + # keep batch norm params node + if ("BatchNorm" in node.name) or ('batch_normalization' in node.name): + for attr in attrs: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + # replace dtype in node attr with target dtype + for attr in attrs: + # keep special node in fp32 + if node.name in keep_fp32_node_name: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + if node.attr[attr].type == types_pb2.DT_FLOAT: + # modify node dtype + node.attr[attr].type = dtype + + if attr == "value": + tensor = node.attr[attr].tensor + if tensor.dtype == types_pb2.DT_FLOAT: + # if float_val exists + if tensor.float_val: + float_val = tf.make_ndarray(node.attr[attr].tensor) + new_node.attr[attr].tensor.CopyFrom( + tf.make_tensor_proto(float_val, dtype=dtype)) + continue + + # if tensor content exists + if tensor.tensor_content: + tensor_shape = [ + x.size for x in tensor.tensor_shape.dim + ] + tensor_weights = tf.make_ndarray(tensor) + 
# reshape tensor + tensor_weights = np.reshape(tensor_weights, + tensor_shape) + tensor_proto = tf.make_tensor_proto(tensor_weights, + dtype=dtype) + new_node.attr[attr].tensor.CopyFrom(tensor_proto) + continue + + new_node.attr[attr].CopyFrom(node.attr[attr]) + + # transform graph + if output_names: + if not input_name: + input_name = [] + transforms = ["strip_unused_nodes"] + target_graph_def = TransformGraph(target_graph_def, input_name, + output_names, transforms) + + # write graph_def to model + tf.io.write_graph(target_graph_def, + logdir=save_path, + name=name, + as_text=as_text) + print("Converting done ...") + + +def convert_graph_to_fp32(model_path, + save_path, + name, + as_text=False, + target_type='fp32', + input_name=None, + output_names=None, + keep_fp16_node_name=[]): + if target_type == 'fp16': + dtype = types_pb2.DT_HALF + elif target_type == 'fp32': + dtype = types_pb2.DT_FLOAT + + source_sess = load_graph(model_path) + source_graph_def = source_sess.graph.as_graph_def() + target_graph_def = graph_pb2.GraphDef() + target_graph_def.versions.CopyFrom(source_graph_def.versions) + + for node in source_graph_def.node: + # fused batch norm node + if node.op == "FusedBatchNorm": + rewrite_batch_norm_node_v2(node, + target_graph_def, + target_type=target_type) + continue + + # replicate node + new_node = target_graph_def.node.add() + new_node.op = node.op + new_node.name = node.name + new_node.input.extend(node.input) + attrs = list(node.attr.keys()) + + # keep batch norm params node + if ("BatchNorm" in node.name) or ('batch_normalization' in node.name): + for attr in attrs: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + # replace dtype in node attr with target dtype + for attr in attrs: + # keep special node in fp16 + if node.name in keep_fp16_node_name: + new_node.attr[attr].CopyFrom(node.attr[attr]) + continue + + if node.attr[attr].type == types_pb2.DT_HALF: + # modify node dtype + node.attr[attr].type = dtype + + if attr == "value": + tensor = node.attr[attr].tensor + if tensor.dtype == types_pb2.DT_HALF: + # if half_val exists + if tensor.half_val: + half_val = tf.make_ndarray(node.attr[attr].tensor) + new_node.attr[attr].tensor.CopyFrom( + tf.make_tensor_proto(half_val, dtype=dtype)) + continue + + # if tensor content exists + if tensor.tensor_content: + tensor_shape = [ + x.size for x in tensor.tensor_shape.dim + ] + tensor_weights = tf.make_ndarray(tensor) + # reshape tensor + tensor_weights = np.reshape(tensor_weights, + tensor_shape) + tensor_proto = tf.make_tensor_proto(tensor_weights, + dtype=dtype) + new_node.attr[attr].tensor.CopyFrom(tensor_proto) + continue + + new_node.attr[attr].CopyFrom(node.attr[attr]) + + # transform graph + if output_names: + if not input_name: + input_name = [] + transforms = ["strip_unused_nodes"] + target_graph_def = TransformGraph(target_graph_def, input_name, + output_names, transforms) + + # write graph_def to model + tf.io.write_graph(target_graph_def, + logdir=save_path, + name=name, + as_text=as_text) + print("Converting done ...") diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/torch_to_onnx.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/torch_to_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..3ffd1cc42ad93036b28c445bc87955e66249ca5d --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/tools/torch_to_onnx.py @@ -0,0 +1,73 @@ +# Copyright 2023 ByteDance and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json + +import numpy as np +import torch + + +def torch_to_onnx(model_path, output_path): + model_name = output_path.split("/")[-1][:-4] + with open("general_perf/model_zoo/" + model_name + "json", "r") as f: + model_info = json.load(f) + model_inputs = model_info["inputs"].split(",") + input_shapes = model_info["input_shape"] + input_type = model_info["input_type"].split(",") + example_inputs = _get_fake_samples(input_shapes, input_type) + + model = torch.jit.load(model_path, map_location=torch.device("cpu")) + model.eval() + + names = model_inputs + dynamic_inputs = {} + for i in range(len(names)): + dynamic_inputs[names[i]] = {0: "batch_size"} + outputs = model_info["outputs"].split(",") + for output in outputs: + dynamic_inputs[output] = {0: "batch_size"} + torch.onnx.export( + model, + example_inputs, + output_path, + opset_version=11, + input_names=names, + output_names=outputs, + dynamic_axes=dynamic_inputs, + ) + + +def _get_fake_samples(shape, type): + data = [] + idx = 0 + for key, val in shape.items(): + val = [val[0] * 1] + val[1:] + data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower()))) + idx += 1 + return data + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", default="") + parser.add_argument("--output_path", default="") + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() + torch_to_onnx(args.model_path, args.output_path) diff --git a/toolbox/ByteMLPerf/byte_infer_perf/general_perf/version.py b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/version.py new file mode 100644 index 0000000000000000000000000000000000000000..608f35d6f6b03ca23f46fbd6500fc32f694a858f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_infer_perf/general_perf/version.py @@ -0,0 +1 @@ +__version__ = '1.0.0' diff --git a/toolbox/ByteMLPerf/byte_micro_perf/README.md b/toolbox/ByteMLPerf/byte_micro_perf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6033f40e265b9060de9e25a6a8f6fc20aaa3aa7f --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/README.md @@ -0,0 +1,73 @@ +# ByteMicroPerf + +## Introduction +ByteMicroPerf is a part of ByteMLPerf, which is mainly used to evaluate the performance of frequent computation and communication operators in mainstream deep learning models on new emerging heterogeneous hardwares. The main characteristics are as follows: + +- Easy and quick access for diverse heterogeneous hardware +- Evaluation process fitting realistic business scenarios +- Coverage of frequent operators across multiple categories + +## Quickstart + +### Prepare running environment + +``` +git clone https://github.com/bytedance/ByteMLPerf.git +cd ByteMLPerf/byte_micro_perf +``` + +### Prepare hardware configuration(optional) +Please follow the given style at `ByteMLPerf/vendor_zoo` directory to create a new hardware config file for your own heterogeneous hardware. 
Because this helps the framework evaluate operator performance on new hardware more precisely. + +### An example + +``` +python3 launch.py --task exp --hardware_type GPU +``` +#### Usage +``` +--task: operator name please create a workload file for new operators by following the existing style in byte_micro_perf/workloads. + +--hardware_type: hardware category name please derive a Backend class for your heterogeneous hardware in byte_micro_perf/backends. +``` + +### Expected Output +For different types of operators (Compute-bound / Memory-bound), we adopt various metrics to comprehensively evaluate the performance of the operator. Regarding the various metrics, the explanations are as follows: +| Metric | Description | +| -------- | ------- | +| Memory Size(MB) | the rough sum of read/write bytes | +| Kernel bandwidth(GB/s) | the achieved bandwidth under given input size of this kernel | +| Bandwidth Utilization(%) | the ratio of achieved bandwidth and theoretical bandwidth | +| Avg latency(us) |the average of kernel latencies| + +Example: +``` +{ + "Operator": "EXP", + "Backend": "GPU", + "Host Info": "Intel(R) Xeon(R) Platinum 8336C CPU @ 2.30GHz", + "Device Info": "NVIDIA A800-SXM4-80GB", + "Performance": [ + { + "Dtype": "float32", + "Tensor Shapes": [ + [ + 256, + 8192 + ] + ], + "Read IO Size(MB)": 8.0, + "Write IO Size(MB)": 8.0, + "Memory Size(MB)": 16.0, + "Kernel bandwidth(GB/s)": 1790.52, + "Bandwidth Utilization(%)": 87.81, + "Avg latency(us)": 9.37, + "QPS": 27321.24 + } + ] +} +``` + +## Trouble Shooting + +For more details, you can visit our offical website here: [bytemlperf.ai](https://bytemlperf.ai/). Please let us know if you need any help or have additional questions and issues! diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/backend_gpu.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/backend_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..cb40d5ea22ff85ecb75dba93b211db2080d1cd16 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/backend_gpu.py @@ -0,0 +1,282 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
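+# GPU reference backend for ByteMicroPerf: each op hook below binds an operator from
+# backends.module_store (or a CUTLASS-backed custom op), build_tensor() sizes the input
+# tensor pool to fit device memory, and initialize_ccl()/setup_2d_group() set up the NCCL
+# process group used by the communication ops.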
+ +import json +import logging +import math +import os +from datetime import timedelta +from typing import Any, Dict, List + +import torch +import torch.distributed as dist +import torch.distributed.distributed_c10d as dist_c10d + +from backends.backend import Backend +from backends.module_store import * +from backends.utils import get_dtype_bytes + +from .custom_ops import GPUGemmOp, GPUBatchGemmOp, GPUGroupGemmOp + + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("PerfEngine") + + +class BackendGPU(Backend): + def get_device_name(self): + return torch.cuda.get_device_name(0) + + def get_backend_properties(self): + self.memory_limit = int( + torch.cuda.get_device_properties(0).total_memory / (1024**3) + ) + + if self.vendor_path is not None and os.path.exists(self.vendor_path) and (self.vendor_path).endswith(".json"): + with open(self.vendor_path, "r") as f: + self.hw_info_dict = json.load(f) + # if the vendor path does not exist, please set this param manaually + self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"] + else: + log.warning( + "Vendor_path: [ {} ] was not found or not a full path points to json, please check your path!!! Otherwise, please set the hardware info manaually.".format( + self.vendor_path + ) + ) + + + # device/host ops + def host2device(self): + self.op = Host2DeviceOp(torch.device("cuda")) + + def device2host(self): + self.op = Device2HostOp() + + + # communication ops + def allreduce(self): + self.setup_2d_group() + self.op = AllReduceOp(self.group) + + def allgather(self): + self.setup_2d_group() + self.op = AllGatherOp(self.group) + + def reducescatter(self): + self.setup_2d_group() + self.op = ReduceScatterOp(self.group) + + def alltoall(self): + self.setup_2d_group() + self.op = AllToAllOp(self.group) + + def broadcast(self): + self.setup_2d_group() + self.op = BroadcastOp(self.group) + + def p2p(self): + self.setup_2d_group() + self.op = P2POp(self.group, self.ranks, self.rank) + + # compute ops + # unary ops + def sin(self): + self.op = SinOp() + + def cos(self): + self.op = CosOp() + + def exp(self): + self.op = ExpOp() + + def exponential(self): + self.op = ExponentialOp() + + def silu(self): + self.op = SiluOp() + + def gelu(self): + self.op = GeluOp() + + def swiglu(self): + self.op = SwiGLUOp() + + def cast(self): + self.op = CastOp() + + + # binary ops + def add(self): + self.op = AddOp() + + def mul(self): + self.op = MulOp() + + def sub(self): + self.op = SubOp() + + def div(self): + self.op = DivOp() + + + # reduce ops + def layernorm(self): + self.op = LayerNormOp() + + def softmax(self): + self.op = SoftmaxOp() + + def reduce_sum(self): + self.op = ReduceSumOp() + + def reduce_min(self): + self.op = ReduceMinOp() + + def reduce_max(self): + self.op = ReduceMaxOp() + + + # index ops + def index_add(self): + self.op = IndexAddOp() + + def sort(self): + self.op = SortOp() + + def unique(self): + self.op = UniqueOp() + + def scatter(self): + self.op = ScatterOp() + + def gather(self): + self.op = GatherOp() + + # gemm ops + def gemm(self): + self.op = GPUGemmOp() + + def gemv(self): + self.op = GPUGemmOp() + + def batch_gemm(self): + self.op = GPUBatchGemmOp() + + def group_gemm(self): + self.op = GPUGroupGemmOp() + + + + # create input tensors + def build_tensor(self, input_shapes, dtype): + torch.cuda.empty_cache() + torch_dtype = getattr(torch, dtype) + + # compute size of input and output tensors + if hasattr(self.op, "compute_size"): + bytes_per_cnt = self.op.compute_size(input_shapes, dtype) + # default: 
input_tensors_size == output_tensor_size, all tensors have same dtype + else: + dtype_size = get_dtype_bytes(dtype) + element_num = 2 * sum([math.prod(shape) for shape in input_shapes]) + bytes_per_cnt = dtype_size * element_num + + # compute max avail tensors for compute + avail_bytes = (self.memory_limit - 4) * 1024**3 + avail_cnts = avail_bytes // bytes_per_cnt + max_data_cnt = min(self.iterations, avail_cnts) + + # create input tensors for each op + input_tensors_list = [] + for _ in range(max_data_cnt): + # create input tensors + if hasattr(self.op, "custom_create_tensors"): + input_tensors = self.op.custom_create_tensors(input_shapes, torch_dtype, "cuda") + input_tensors_list.append(input_tensors) + # default: all input tensors have same dtype + else: + if torch_dtype in [torch.int8, torch.int32]: + input_tensors = [ + torch.randint(-3, 3, size=shape, dtype=torch_dtype, device="cuda") + for shape in input_shapes + ] + else: + input_tensors = [ + torch.randn(shape, dtype=torch_dtype, device="cuda") + for shape in input_shapes + ] + input_tensors_list.append(input_tensors) + if hasattr(self.op, "process_inputs"): + input_tensors_list = [ + self.op.process_inputs(*(input_tensor)) + for input_tensor in input_tensors_list + ] + return input_tensors_list, max_data_cnt, bytes_per_cnt + + + + def _run_operation(self, operation, inputs): + result = operation(*inputs) + return result + + def device_synchronize(self): + torch.cuda.synchronize() + return True + + def initialize_ccl(self, rank, world_size): + """ + initialize distributed process groups and relevant ENVs + """ + # check device_count + device_count = torch.cuda.device_count() + if world_size > device_count: + world_size = device_count + if rank >= world_size: + return False + + # set envs + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "49373" + os.environ["LOCAL_RANK"] = str(rank) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.cuda.set_device(rank) + + # Call the init process + timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30)) + torch.distributed.init_process_group( + backend="nccl", + world_size=world_size, + rank=rank, + store=None, + timeout=timedelta(seconds=timeout_seconds), + ) + self.setup_2d_group() + log.warning("DIST: rank {}, world_size {}".format(rank, world_size)) + return True + + def setup_2d_group(self): + self.rank = dist.get_rank() + torch.cuda.set_device(self.rank) + origin_store_based_barrier = dist_c10d._store_based_barrier + dist_c10d._store_based_barrier = lambda *a, **kw: None + self.world_size = dist.get_world_size() + self.ranks = range(0, self.world_size) + group = dist.new_group(self.ranks) + if self.rank in self.ranks: + self.group = group + dist_c10d._store_based_barrier = origin_store_based_barrier + # wait for all ranks finish group initializing + torch.distributed.barrier() diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/custom_ops.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..6f4a6b9acae5806bc06a1d21a66effe92b0122e9 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/custom_ops.py @@ -0,0 +1,119 @@ +from typing import List + +import torch +import cutlass + +from backends.module_store import GemmOp, BatchGemmOp, GroupGemmOp + + +# gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# gemm(cutlass) int8 --> int32 +class GPUGemmOp(GemmOp): + def __init__(self): + 
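+        # int8 GEMMs run through a CUTLASS kernel emitted once here; float dtypes fall
+        # back to torch.mm in forward().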
super().__init__() + + try: + import cutlass + dtype = torch.int8 + accum_dtype=torch.int32 + self.plan = cutlass.op.Gemm( + alpha=1, beta=0, + element_A=dtype, + element_B=dtype, + element_C=accum_dtype, + element_D=accum_dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.RowMajor, + layout_C=cutlass.LayoutType.RowMajor + ) + self.op = self.plan.construct() + self.gemm_op_int8 = cutlass.emit.pytorch( + self.op, name='gemm', cc=self.plan.cc, + jit=True, sourcedir='out' + ) + except: + self.gemm_op_int8 = None + raise Exception("GPUGemmOp cutlass error") + + def forward( + self, + input_tensor_a : torch.Tensor, + input_tensor_b : torch.Tensor + ): + compute_dtype = input_tensor_a.dtype + if compute_dtype == torch.int8: + output_tensor = self.gemm_op_int8.run(input_tensor_a, input_tensor_b) + else: + output_tensor = torch.mm(input_tensor_a, input_tensor_b) + return output_tensor + + +# batch_gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# batch_gemm(cutlass) int8 --> int32 +class GPUBatchGemmOp(BatchGemmOp): + def __init__(self): + super().__init__() + + try: + import cutlass + except: + raise Exception("GPUBatchGemmOp import cutlass error") + + def forward( + self, + input_tensor_a : torch.Tensor, + input_tensor_b : torch.Tensor + ): + compute_dtype = input_tensor_a.dtype + + output_tensor = None + if compute_dtype == torch.int8: + bs, m, n = input_tensor_a.shape[0], input_tensor_a.shape[1], input_tensor_b.shape[2] + c_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda") + output_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda") + plan = cutlass.op.Gemm(A=input_tensor_a, B=input_tensor_b, C=c_tensor, D=output_tensor, element_accumulator=cutlass.DataType.s32) + plan.run(input_tensor_a, input_tensor_b, c_tensor, output_tensor, 1, 0) + else: + output_tensor = torch.bmm(input_tensor_a, input_tensor_b) + return output_tensor + + +# group_gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# group_gemm(cutlass) int8 --> int32 +class GPUGroupGemmOp(GroupGemmOp): + def __init__(self): + super().__init__() + + try: + import cutlass + dtype = torch.int8 + accum_dtype=torch.int32 + self.plan = cutlass.op.GroupedGemm( + alpha=1, beta=0, + element_A=dtype, + element_B=dtype, + element_C=accum_dtype, + element_D=accum_dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.RowMajor, + layout_C=cutlass.LayoutType.RowMajor + ) + self.op = self.plan.construct() + self.gemm_op_int8 = cutlass.emit.pytorch( + self.op, name='group_gemm', cc=self.plan.cc, + jit=True, sourcedir='out' + ) + except: + self.gemm_op_int8 = None + raise Exception("GPUGroupGemmOp cutlass error") + + def forward(self, + a_list : List[torch.Tensor], + b_list : List[torch.Tensor] + ): + compute_dtype = a_list[0].dtype + if compute_dtype == torch.int8: + output_tensors = self.gemm_op_int8.run(a_list, b_list) + else: + output_tensors = [a @ b for a, b in zip(a_list, b_list)] + return output_tensors \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/requirements.txt b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e45aca82def1619f23d7570e8b4f90f122686636 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/GPU/requirements.txt @@ -0,0 +1,2 @@ +torch==2.1.0 +nvidia-cutlass diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py 
b/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py new file mode 100644 index 0000000000000000000000000000000000000000..02807ac4263cb031ef1492b72deb4a96d0fe4978 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/backend_iluvatar.py @@ -0,0 +1,280 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +## limitations under the License. + +import json +import logging +import math +import os +from datetime import timedelta +from typing import Any, Dict, List + +import torch +import torch.distributed as dist +import torch.distributed.distributed_c10d as dist_c10d + +from backends.backend import Backend +from backends.module_store import * +from backends.utils import get_dtype_bytes + +from backends.module_store import GemmOp, GemvOp, BatchGemmOp, GroupGemmOp + + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("PerfEngine") + + +class BackendILUVATAR(Backend): + def get_device_name(self): + return torch.cuda.get_device_name(0) + + def get_backend_properties(self): + self.memory_limit = int( + torch.cuda.get_device_properties(0).total_memory / (1024**3) + ) + if self.vendor_path is not None and os.path.exists(self.vendor_path) and (self.vendor_path).endswith(".json"): + with open(self.vendor_path, "r") as f: + self.hw_info_dict = json.load(f) + # if the vendor path does not exist, please set this param manaually + self.bandwidth_limit = self.hw_info_dict["内存参数"]["内存"]["内存带宽(GB/s)"] + else: + log.warning( + "Vendor_path: [ {} ] was not found or not a full path points to json, please check your path!!! 
Otherwise, please set the hardware info manaually.".format( + self.vendor_path + ) + ) + + # device/host ops + def host2device(self): + self.op = Host2DeviceOp(torch.device("cuda")) + + def device2host(self): + self.op = Device2HostOp() + + + # communication ops + def allreduce(self): + self.setup_2d_group() + self.op = AllReduceOp(self.group) + + def allgather(self): + self.setup_2d_group() + self.op = AllGatherOp(self.group) + + def reducescatter(self): + self.setup_2d_group() + self.op = ReduceScatterOp(self.group) + + def alltoall(self): + self.setup_2d_group() + self.op = AllToAllOp(self.group) + + def broadcast(self): + self.setup_2d_group() + self.op = BroadcastOp(self.group) + + def p2p(self): + self.setup_2d_group() + self.op = P2POp(self.group, self.ranks, self.rank) + + + # compute ops + # unary ops + def sin(self): + self.op = SinOp() + + def cos(self): + self.op = CosOp() + + def exp(self): + self.op = ExpOp() + + def exponential(self): + self.op = ExponentialOp() + + def silu(self): + self.op = SiluOp() + + def gelu(self): + self.op = GeluOp() + + def swiglu(self): + self.op = SwiGLUOp() + + def cast(self): + self.op = CastOp() + + + # binary ops + def add(self): + self.op = AddOp() + + def mul(self): + self.op = MulOp() + + def sub(self): + self.op = SubOp() + + def div(self): + self.op = DivOp() + + + # reduce ops + def layernorm(self): + self.op = LayerNormOp() + + def softmax(self): + self.op = SoftmaxOp() + + def reduce_sum(self): + self.op = ReduceSumOp() + + def reduce_min(self): + self.op = ReduceMinOp() + + def reduce_max(self): + self.op = ReduceMaxOp() + + + # index ops + def index_add(self): + self.op = IndexAddOp() + + def sort(self): + self.op = SortOp() + + def unique(self): + self.op = UniqueOp() + + def scatter(self): + self.op = ScatterOp() + + def gather(self): + self.op = GatherOp() + + + # gemm ops + def gemm(self): + self.op = GemmOp() + + def gemv(self): + self.op = GemvOp() + + def batch_gemm(self): + self.op = BatchGemmOp() + + def group_gemm(self): + self.op = GroupGemmOp() + + + # create input tensors + def build_tensor(self, input_shapes, dtype): + torch.cuda.empty_cache() + torch_dtype = getattr(torch, dtype) + + # compute size of input and output tensors + if hasattr(self.op, "compute_size"): + bytes_per_cnt = self.op.compute_size(input_shapes, dtype) + # default: input_tensors_size == output_tensor_size, all tensors have same dtype + else: + dtype_size = get_dtype_bytes(dtype) + element_num = 2 * sum([math.prod(shape) for shape in input_shapes]) + bytes_per_cnt = dtype_size * element_num + + # compute max avail tensors for compute + avail_bytes = (self.memory_limit - 4) * 1024**3 + avail_cnts = avail_bytes // bytes_per_cnt + max_data_cnt = min(self.iterations, avail_cnts) + + # create input tensors for each op + input_tensors_list = [] + for _ in range(max_data_cnt): + # create input tensors + if hasattr(self.op, "custom_create_tensors"): + input_tensors = self.op.custom_create_tensors(input_shapes, torch_dtype, "cuda") + input_tensors_list.append(input_tensors) + # default: all input tensors have same dtype + else: + if torch_dtype in [torch.int8, torch.int32]: + input_tensors = [ + torch.randint(-3, 3, size=shape, dtype=torch_dtype, device="cuda") + for shape in input_shapes + ] + else: + input_tensors = [ + torch.randn(shape, dtype=torch_dtype, device="cuda") + for shape in input_shapes + ] + input_tensors_list.append(input_tensors) + if hasattr(self.op, "process_inputs"): + input_tensors_list = [ + self.op.process_inputs(*(input_tensor)) + 
for input_tensor in input_tensors_list + ] + return input_tensors_list, max_data_cnt, bytes_per_cnt + + + def _run_operation(self, operation, inputs): + result = operation(*inputs) + return result + + def device_synchronize(self): + torch.cuda.synchronize() + return True + + def initialize_ccl(self, rank, world_size): + """ + initialize distributed process groups and relevant ENVs + """ + # check device_count + device_count = torch.cuda.device_count() + if world_size > device_count: + world_size = device_count + if rank >= world_size: + return False + + # set envs + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "49373" + os.environ["LOCAL_RANK"] = str(rank) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + torch.cuda.set_device(rank) + + # Call the init process + timeout_seconds = int(os.environ.get("MEGATRON_NCCL_TIMEOUT_SECOND", 30)) + torch.distributed.init_process_group( + backend="nccl", + world_size=world_size, + rank=rank, + store=None, + timeout=timedelta(seconds=timeout_seconds), + ) + self.setup_2d_group() + log.warning("DIST: rank {}, world_size {}".format(rank, world_size)) + return True + + def setup_2d_group(self): + self.rank = dist.get_rank() + torch.cuda.set_device(self.rank) + origin_store_based_barrier = dist_c10d._store_based_barrier + dist_c10d._store_based_barrier = lambda *a, **kw: None + self.world_size = dist.get_world_size() + self.ranks = range(0, self.world_size) + group = dist.new_group(self.ranks) + if self.rank in self.ranks: + self.group = group + dist_c10d._store_based_barrier = origin_store_based_barrier + # wait for all ranks finish group initializing + torch.distributed.barrier() \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/custom_ops.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/custom_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fcb1dfb655ef76e1a46c0a16c0ba0700bf8c8b9 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/custom_ops.py @@ -0,0 +1,119 @@ +from typing import List + +import torch +import cutlass + +from backends.module_store import GemmOp, BatchGemmOp, GroupGemmOp + + +# gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# gemm(cutlass) int8 --> int32 +class ILUVATARGemmOp(GemmOp): + def __init__(self): + super().__init__() + + try: + import cutlass + dtype = torch.int8 + accum_dtype=torch.int32 + self.plan = cutlass.op.Gemm( + alpha=1, beta=0, + element_A=dtype, + element_B=dtype, + element_C=accum_dtype, + element_D=accum_dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.RowMajor, + layout_C=cutlass.LayoutType.RowMajor + ) + self.op = self.plan.construct() + self.gemm_op_int8 = cutlass.emit.pytorch( + self.op, name='gemm', cc=self.plan.cc, + jit=True, sourcedir='out' + ) + except: + self.gemm_op_int8 = None + raise Exception("ILUVATARGemmOp cutlass error") + + def forward( + self, + input_tensor_a : torch.Tensor, + input_tensor_b : torch.Tensor + ): + compute_dtype = input_tensor_a.dtype + if compute_dtype == torch.int8: + output_tensor = self.gemm_op_int8.run(input_tensor_a, input_tensor_b) + else: + output_tensor = torch.mm(input_tensor_a, input_tensor_b) + return output_tensor + + +# batch_gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# batch_gemm(cutlass) int8 --> int32 +class ILUVATARBatchGemmOp(BatchGemmOp): + def __init__(self): + super().__init__() + + try: + import cutlass + except: + raise 
Exception("ILUVATARBatchGemmOp import cutlass error") + + def forward( + self, + input_tensor_a : torch.Tensor, + input_tensor_b : torch.Tensor + ): + compute_dtype = input_tensor_a.dtype + + output_tensor = None + if compute_dtype == torch.int8: + bs, m, n = input_tensor_a.shape[0], input_tensor_a.shape[1], input_tensor_b.shape[2] + c_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda") + output_tensor = torch.randint(-3, 3, [bs, m, n], dtype=torch.int32, device="cuda") + plan = cutlass.op.Gemm(A=input_tensor_a, B=input_tensor_b, C=c_tensor, D=output_tensor, element_accumulator=cutlass.DataType.s32) + plan.run(input_tensor_a, input_tensor_b, c_tensor, output_tensor, 1, 0) + else: + output_tensor = torch.bmm(input_tensor_a, input_tensor_b) + return output_tensor + + +# group_gemm(pytorch) float32/float16/bfloat16 --> float32/float16/bfloat16 +# group_gemm(cutlass) int8 --> int32 +class ILUVATARGroupGemmOp(GroupGemmOp): + def __init__(self): + super().__init__() + + try: + import cutlass + dtype = torch.int8 + accum_dtype=torch.int32 + self.plan = cutlass.op.GroupedGemm( + alpha=1, beta=0, + element_A=dtype, + element_B=dtype, + element_C=accum_dtype, + element_D=accum_dtype, + layout_A=cutlass.LayoutType.RowMajor, + layout_B=cutlass.LayoutType.RowMajor, + layout_C=cutlass.LayoutType.RowMajor + ) + self.op = self.plan.construct() + self.gemm_op_int8 = cutlass.emit.pytorch( + self.op, name='group_gemm', cc=self.plan.cc, + jit=True, sourcedir='out' + ) + except: + self.gemm_op_int8 = None + raise Exception("ILUVATARGroupGemmOp cutlass error") + + def forward(self, + a_list : List[torch.Tensor], + b_list : List[torch.Tensor] + ): + compute_dtype = a_list[0].dtype + if compute_dtype == torch.int8: + output_tensors = self.gemm_op_int8.run(a_list, b_list) + else: + output_tensors = [a @ b for a, b in zip(a_list, b_list)] + return output_tensors \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/requirements.txt b/toolbox/ByteMLPerf/byte_micro_perf/backends/ILUVATAR/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/__init__.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/backend.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..00ac40f2c72b62b945b05c5f3856d621b3427f60 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/backend.py @@ -0,0 +1,253 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
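+# Abstract base class shared by all micro-perf backends. Vendors implement the abstract
+# device/tensor/ccl hooks; perf() then builds the input tensors, warms up, times
+# `iterations` runs of the selected op, and dumps a communication or computation report.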
+ +import os +import time +import random +import traceback +from abc import ABC, abstractmethod +from typing import Any, Dict, List + +from backends.utils import dump_communication_ops_report, dump_computation_ops_report + +class Backend(ABC): + def __init__(self, workload_dict: Dict[str, Any], vendor_path: str): + self.op_name = workload_dict["operator"] + self.iterations = workload_dict["iterations"] + self.warmup = int(0.1 * workload_dict["iterations"]) + self.vendor_path = vendor_path + self.op = None + + # communication params + self.rank = None + self.world_size = None + self.group = None + + # hardware info + self.hw_info_dict = None + self.memory_limit = None + self.bandwidth_limit = None + self.get_backend_properties() + + self.target_dtype = None + + @abstractmethod + def get_device_name(self): + pass + + @abstractmethod + def get_backend_properties(self): + pass + + @abstractmethod + def build_tensor(self, input_shapes: List[List[int]], dtype): + pass + + @abstractmethod + def _run_operation(self, operation, inputs): + pass + + @abstractmethod + def device_synchronize(self): + pass + + @abstractmethod + def initialize_ccl(self, rank, world_size): + pass + + @abstractmethod + def setup_2d_group(self): + pass + + + # communication ops + def host2device(self): + pass + + def device2host(self): + pass + + def allreduce(self): + pass + + def allgather(self): + pass + + def reducescatter(self): + pass + + def alltoall(self): + pass + + def broadcast(self): + pass + + def p2p(self): + pass + + # compute ops + # unary ops + def sin(self): + pass + + def cos(self): + pass + + def exp(self): + pass + + def exponential(self): + pass + + def silu(self): + pass + + def gelu(self): + pass + + def swiglu(self): + pass + + def cast(self): + pass + + + # binary ops + def add(self): + pass + + def mul(self): + pass + + def sub(self): + pass + + def div(self): + pass + + + # reduce ops + def layernorm(self): + pass + + def softmax(self): + pass + + def reduce_sum(self): + pass + + def reduce_min(self): + pass + + def reduce_max(self): + pass + + + # index ops + def index_add(self): + pass + + def sort(self): + pass + + def unique(self): + pass + + def scatter(self): + pass + + def gather(self): + pass + + + # gemm ops + def gemm(self): + pass + + def gemv(self): + pass + + def batch_gemm(self): + pass + + def group_gemm(self): + pass + + + # perf specify input_shape for + def perf(self, input_shapes: List[List[int]], dtype): + error = "" + + # create input tensors based on input_shapes and dtype + tensor_list, tensor_cnt, tensor_size_perc_cnt = self.build_tensor( + input_shapes, dtype + ) + + if tensor_cnt > 0: + try: + # random select input tensors + input_index_list = [ + random.randint(0, tensor_cnt - 1) for _ in range(self.iterations) + ] + + # warmup + num_warm_up = 10 + for _ in range(num_warm_up): + self._run_operation(self.op, tensor_list[0]) + + # perf + self.device_synchronize() + start_time = time.perf_counter_ns() + for i in range(self.iterations): + self._run_operation( + self.op, + tensor_list[input_index_list[i]] + ) + self.device_synchronize() + end_time = time.perf_counter_ns() + + # time in us + total_exec_time = (end_time - start_time) / 1e3 + latency = round(total_exec_time / self.iterations, 2) + except Exception as e: + traceback.print_exc() + latency = 0 + error = "RUN_OP_ERROR" + else: + latency = 0 + error = "OOM" + + tensor_list = [] + + if self.op_name in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]: + report = 
dump_communication_ops_report( + self.op_name, + dtype, + input_shapes, + self.group.size(), + None, + latency, + error + ) + else: + report = dump_computation_ops_report( + self.op_name, + dtype, + input_shapes, + self.bandwidth_limit, + latency, + error + ) + return report + diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/module_store.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/module_store.py new file mode 100644 index 0000000000000000000000000000000000000000..a821ab1143a2d942d6b951e82048b09e67e809cc --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/module_store.py @@ -0,0 +1,615 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random +from typing import List + +import torch +import torch.distributed as dist + +from .utils import get_dtype_bytes + + +class GemmOp(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def compute_size(self, input_shapes, dtype): + # input_shapes: [[M, K], [K, N]] + torch_dtype = getattr(torch, dtype) + a_shape, b_shape = input_shapes + M, K = a_shape + K, N = b_shape + d_shape = [M, N] + dtype_size = get_dtype_bytes(dtype) + input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]]) + output_element_num = sum([math.prod(shape) for shape in [d_shape]]) + if torch_dtype == torch.int8: + bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num + else: + bytes_per_cnt = dtype_size * (input_element_num + output_element_num) + return bytes_per_cnt + + def forward(self, input_tensor_a, input_tensor_b): + compute_dtype = input_tensor_a.dtype + output_tensor = None + if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]: + output_tensor = torch.mm(input_tensor_a, input_tensor_b) + else: + raise Exception(f"GemmOp with dtype {compute_dtype} is not implemented") + return output_tensor + + +class GemvOp(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def compute_size(self, input_shapes, dtype): + # input_shapes: [[M, K], [K, N]] + torch_dtype = getattr(torch, dtype) + a_shape, b_shape = input_shapes + M, K = a_shape + K, N = b_shape + d_shape = [M, N] + dtype_size = get_dtype_bytes(dtype) + input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]]) + output_element_num = sum([math.prod(shape) for shape in [d_shape]]) + if torch_dtype == torch.int8: + bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num + else: + bytes_per_cnt = dtype_size * (input_element_num + output_element_num) + return bytes_per_cnt + + def forward(self, input_tensor_a, input_tensor_b): + compute_dtype = input_tensor_a.dtype + output_tensor = None + if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]: + output_tensor = torch.mm(input_tensor_a, input_tensor_b) + else: + raise Exception(f"GemvOp with dtype {compute_dtype} is not implemented") + return 
output_tensor + + +class BatchGemmOp(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def compute_size(self, input_shapes, dtype): + # input_shapes: [[bs, M, K], [bs, K, N]] + torch_dtype = getattr(torch, dtype) + a_shape, b_shape = input_shapes + bs, M, K = a_shape + bs, K, N = b_shape + d_shape = [bs, M, N] + dtype_size = get_dtype_bytes(dtype) + input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]]) + output_element_num = sum([math.prod(shape) for shape in [d_shape]]) + if torch_dtype == torch.int8: + bytes_per_cnt = dtype_size * input_element_num + get_dtype_bytes("int32") * output_element_num * 2 + else: + bytes_per_cnt = dtype_size * (input_element_num + output_element_num) + return bytes_per_cnt + + def forward(self, input_tensor_a, input_tensor_b): + compute_dtype = input_tensor_a.dtype + output_tensor = None + if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]: + output_tensor = torch.bmm(input_tensor_a, input_tensor_b) + else: + raise Exception(f"BatchGemmOp with dtype {compute_dtype} is not implemented") + return output_tensor + + +class GroupGemmOp(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def compute_size(self, input_shapes, dtype): + """ + [ + [[M1, K1], [K1, N1]], + [[M2, K2], [K2, N2]] + ] + """ + torch_dtype = getattr(torch, dtype) + bytes_per_cnt = 0 + for problem_shape in input_shapes: + a_shape, b_shape = problem_shape + M, K = a_shape + K, N = b_shape + d_shape = [M, N] + dtype_size = get_dtype_bytes(dtype) + input_element_num = sum([math.prod(shape) for shape in [a_shape, b_shape]]) + output_element_num = sum([math.prod(shape) for shape in [d_shape]]) + if torch_dtype == torch.int8: + bytes_per_cnt += dtype_size * input_element_num + get_dtype_bytes("float32") * output_element_num + else: + bytes_per_cnt += dtype_size * (input_element_num + output_element_num) + return bytes_per_cnt + + def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device): + """ + [ + [[M1, K1], [K1, N1]], + [[M2, K2], [K2, N2]] + ] + """ + left_tensors = [] + right_tensors = [] + + for problem_shape in input_shapes: + a_shape, b_shape = problem_shape + if torch_dtype in [torch.int8, torch.int32]: + left_tensor = torch.randint(-3, 3, size=a_shape, dtype=torch_dtype, device=xpu_device) + right_tensor = torch.randint(-3, 3, size=b_shape, dtype=torch_dtype, device=xpu_device) + else: + left_tensor = torch.randn(a_shape, dtype=torch_dtype, device=xpu_device) + right_tensor = torch.randn(b_shape, dtype=torch_dtype, device=xpu_device) + left_tensors.append(left_tensor) + right_tensors.append(right_tensor) + + return [left_tensors, right_tensors] + + def forward(self, input_tensor_a, input_tensor_b): + compute_dtype = input_tensor_a[0].dtype + output_tensor_list = [] + for a, b in zip(input_tensor_a, input_tensor_b): + if compute_dtype in [torch.float32, torch.float16, torch.bfloat16]: + output_tensor = torch.mm(a, b) + output_tensor_list.append(output_tensor) + else: + raise Exception(f"GroupGemmOp with dtype {compute_dtype} is not implemented") + return output_tensor_list + + +class Host2DeviceOp(torch.nn.Module): + def __init__(self, xpu_device): + super().__init__() + self.xpu_device = xpu_device + + def process_inputs(self, input_tensors): + new_inputs = input_tensors.cpu() + return [new_inputs] + + def forward(self, input_tensors): + assert input_tensors.device.type == "cpu" + output_xpu = input_tensors.to(self.xpu_device) + return 
output_xpu + + +class Device2HostOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + assert input_tensors.device.type != "cpu" + output_cpu = input_tensors.cpu() + return output_cpu + + +class AllReduceOp(torch.nn.Module): + def __init__(self, group): + super().__init__() + self.group = group + + def forward(self, input_tensors): + dist.all_reduce(input_tensors, group=self.group) + return True + + +class AllGatherOp(torch.nn.Module): + def __init__(self, group): + super().__init__() + self.group = group + + def process_inputs(self, input_tensors): + input_tensor_list = list( + torch.chunk(input_tensors, dist.get_world_size(self.group)) + ) + return [input_tensor_list] + + def forward(self, input_tensor_list): + dist.all_gather( + input_tensor_list, + input_tensor_list[dist.get_rank(self.group)], + group=self.group, + ) + return True + + +class ReduceScatterOp(torch.nn.Module): + def __init__(self, group): + super().__init__() + self.group = group + + def process_inputs(self, input_tensors): + input_tensor_list = list( + torch.chunk(input_tensors, dist.get_world_size(self.group)) + ) + return [input_tensor_list] + + def forward(self, input_tensor_list): + dist.reduce_scatter( + input_tensor_list[dist.get_rank(self.group)], + input_tensor_list, + group=self.group, + ) + return True + + +class AllToAllOp(torch.nn.Module): + def __init__(self, group): + super().__init__() + self.group = group + + def process_inputs(self, input_tensor, output_tensor): + input_tensor_list = list( + torch.chunk(input_tensor, dist.get_world_size(self.group)) + ) + output_tensor_list = list( + torch.chunk(output_tensor, dist.get_world_size(self.group)) + ) + return [input_tensor_list, output_tensor_list] + + def forward(self, in_tensors_list, out_tensors_list): + dist.all_to_all(out_tensors_list, in_tensors_list, group=self.group) + return True + + +class BroadcastOp(torch.nn.Module): + def __init__(self, group): + super().__init__() + self.group = group + + def forward(self, input_tensors): + dist.broadcast(input_tensors, 0, self.group) + return True + + +class P2POp(torch.nn.Module): + def __init__(self, group, ranks, rank): + super().__init__() + self.group = group + self.group_size = self.group.size() + self.rank = rank + self.ranks = ranks + self.rank_size = len(ranks) + + def next_rank(self): + return self.ranks[(self.rank + 1) % self.rank_size] + + def prev_rank(self): + return self.ranks[(self.rank - 1) % self.rank_size] + + def forward(self, send_tensor, recv_tensor): + reqs = [] + if self.rank != (self.group_size - 1): + send_req = dist.isend(send_tensor, self.next_rank(), self.group) + reqs.append(send_req) + if self.rank != 0: + recv_req = dist.irecv(recv_tensor, self.prev_rank(), self.group) + reqs.append(recv_req) + + for req in reqs: + req.wait() + return True + + +class SinOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.sin(input_tensors) + return result + + +class CosOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.cos(input_tensors) + return result + + +class ExpOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.exp(input_tensors) + return result + + +class ExponentialOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = input_tensors.exponential_() + return result + + +class 
SiluOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.nn.functional.silu(input_tensors) + return result + + +class GeluOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.nn.functional.gelu(input_tensors) + return result + + +class SwiGLUOp(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.w = 1 + self.v = 2 + + def forward(self, input_tensors): + result = (torch.nn.functional.sigmoid(input_tensors) * self.w) + (input_tensors * self.v) + return result + + +class CastOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def set_dtype(self, src_dtype: str): + target_dtype = "bfloat16" if src_dtype == "float32" else "float32" + self.target_dtype = target_dtype + self.target_torch_dtype = getattr(torch, target_dtype) + + def compute_size(self, input_shapes, dtype): + torch_dtype = getattr(torch, dtype) + self.set_dtype(dtype) + dtype_size = get_dtype_bytes(dtype) + target_dtype_size = get_dtype_bytes(self.target_dtype) + element_num = sum([math.prod(shape) for shape in input_shapes]) + bytes_per_cnt = dtype_size * element_num + target_dtype_size * element_num + return bytes_per_cnt + + def forward(self, input_tensors): + result = input_tensors.to(self.target_torch_dtype) + return result + + +class AddOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensor_a, input_tensor_b): + result = input_tensor_a + input_tensor_b + return result + + +class MulOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensor_a, input_tensor_b): + result = input_tensor_a * input_tensor_b + return result + + +class SubOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensor_a, input_tensor_b): + result = input_tensor_a - input_tensor_b + return result + + +class DivOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensor_a, input_tensor_b): + result = input_tensor_a / input_tensor_b + return result + + +class LayerNormOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.nn.functional.layer_norm( + input_tensors, (input_tensors.shape[-1],) + ) + return result + + +class SoftmaxOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.nn.functional.softmax(input_tensors, dim=-1) + return result + + +class ReduceSumOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.sum(input_tensors, dim=-1) + return result + + +class ReduceMinOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.min(input_tensors, dim=-1) + return result + + +class ReduceMaxOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.max(input_tensors, dim=-1) + return result + + +class IndexAddOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def process_inputs(self, input_tensor, source_tensor): + index = torch.randint(0, input_tensor.shape[0], (source_tensor.shape[0],)).to( + input_tensor.device + ) + return [input_tensor, index, source_tensor] + + def forward(self, input_tensor, index, source_tensor): + result = input_tensor.index_add_(0, index, source_tensor) + return result + + 
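+# Note: the single-tensor ops in this file share one pattern: forward() applies the corresponding torch call to the tensors it is given, ops that need extra inputs (e.g. IndexAddOp above) build them in process_inputs(), and ops whose memory traffic is not a plain element count (e.g. CastOp, ScatterOp, GatherOp) also implement compute_size(), which returns the bytes touched per call.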
+class SortOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.sort(input_tensors) + return result + + +class UniqueOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, input_tensors): + result = torch.unique(input_tensors, return_counts=True) + return result + + +class ScatterOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def compute_size(self, input_shapes, dtype): + # dst: [batch_size, len], dtype + # index: [batch_size, len], int64 + # src: [batch_size, len], dtype + tensor_shape = input_shapes[0] + + tensor_dtype_size = get_dtype_bytes(dtype) + index_dtype_size = get_dtype_bytes("int64") + + shape_func = lambda shape: math.prod(shape) + + bytes_per_cnt = ( + shape_func(tensor_shape) * tensor_dtype_size + + shape_func(tensor_shape) * index_dtype_size + + shape_func(tensor_shape) * tensor_dtype_size + ) + + return bytes_per_cnt + + def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device): + # dst: [batch_size, len], dtype + # index: [batch_size, len], int64 + # src: [batch_size, len], dtype + tensor_shape = input_shapes[0] + + dst_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device) + src_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device) + + # dim = 0 + # dst[index[i, j], j] = src[i, j] + batch_size = tensor_shape[0] + tensor_len = tensor_shape[1] + + index = [i for i in range(batch_size)] + random.shuffle(index) + index_tensor = torch.cat( + [torch.full((1, tensor_len), i, dtype=torch.int64, device=xpu_device) for i in index], + dim=0 + ) + + return [dst_tensor, index_tensor, src_tensor] + + + def forward(self, dst_tensor, index_tensor, src_tensor): + dst_tensor.scatter_(0, index_tensor, src_tensor) + return dst_tensor + + +class GatherOp(torch.nn.Module): + def __init__(self): + super().__init__() + + def compute_size(self, input_shapes, dtype): + # dst: [batch_size, len], dtype + # index: [batch_size, len], int64 + # src: [batch_size, len], dtype + tensor_shape = input_shapes[0] + + tensor_dtype_size = get_dtype_bytes(dtype) + index_dtype_size = get_dtype_bytes("int64") + + shape_func = lambda shape: math.prod(shape) + + bytes_per_cnt = ( + shape_func(tensor_shape) * tensor_dtype_size + + shape_func(tensor_shape) * index_dtype_size + + shape_func(tensor_shape) * tensor_dtype_size + ) + + return bytes_per_cnt + + def custom_create_tensors(self, input_shapes, torch_dtype, xpu_device): + # dst: [batch_size, len], dtype + # index: [batch_size, len], int64 + # src: [batch_size, len], dtype + tensor_shape = input_shapes[0] + + dst_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device) + src_tensor = torch.empty(tensor_shape, dtype=torch_dtype, device=xpu_device) + + # dim = 0 + # dst[index[i, j], j] = src[i, j] + batch_size = tensor_shape[0] + tensor_len = tensor_shape[1] + + index = [i for i in range(batch_size)] + random.shuffle(index) + index_tensor = torch.cat( + [torch.full((1, tensor_len), i, dtype=torch.int64, device=xpu_device) for i in index], + dim=0 + ) + + return [dst_tensor, index_tensor, src_tensor] + + + def forward(self, dst_tensor, index_tensor, src_tensor): + torch.gather(src_tensor, 0, index_tensor, out=dst_tensor) + return dst_tensor \ No newline at end of file diff --git a/toolbox/ByteMLPerf/byte_micro_perf/backends/utils.py b/toolbox/ByteMLPerf/byte_micro_perf/backends/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..3216286ac4752a0129d92730a310d2984ffb9f62 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/backends/utils.py @@ -0,0 +1,207 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List + +import numpy as np +import torch + + +def get_dtype_bytes(dtype: str): + torch_dtype = getattr(torch, dtype) + dtype_size = 0 + if torch_dtype in [torch.int64, torch.int32, torch.int8]: + dtype_size = torch.iinfo(torch_dtype).bits // 8 + elif torch_dtype in [torch.float32, torch.float16, torch.bfloat16]: + dtype_size = torch.finfo(torch_dtype).bits // 8 + else: + # not supported yet + pass + return dtype_size + + +def get_io_amount(op_name, input_shapes, dtype): + batch_size = input_shapes[0][0] + dtype_size = get_dtype_bytes(dtype) + if op_name in ["add", "mul", "sub", "div"]: + # c = a + b + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + write_io_amount = dtype_size * math.prod(input_shapes[0]) + elif op_name == "gemm": + M = input_shapes[0][0] + K = input_shapes[0][1] + N = input_shapes[1][1] + read_io_amount = dtype_size * (M * K + K * N) + # dtype is a string here, so compare with "int8" rather than torch.int8 + if dtype != "int8": + write_io_amount = dtype_size * (M * N) + else: + write_io_amount = get_dtype_bytes("int32") * (M * N) + elif op_name == "batch_gemm": + bs = input_shapes[0][0] + M = input_shapes[0][1] + K = input_shapes[0][2] + N = input_shapes[1][2] + read_io_amount = dtype_size * bs * (M * K + K * N) + if dtype != "int8": + write_io_amount = dtype_size * bs * (M * N) + else: + write_io_amount = get_dtype_bytes("int32") * bs * (M * N) + elif op_name == "group_gemm": + in_size_list = [] + out_size_list = [] + m_list = [] + for problem_shape in input_shapes: + M = problem_shape[0][0] + K = problem_shape[0][1] + N = problem_shape[1][1] + in_size_list.append(M * K + K * N) + out_size_list.append(M * N) + m_list.append(M) + batch_size = sum(m_list) + read_io_amount = dtype_size * sum(in_size_list) + if dtype != "int8": + write_io_amount = dtype_size * sum(out_size_list) + else: + write_io_amount = get_dtype_bytes("int32") * sum(out_size_list) + elif op_name in ["device2host"]: + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + write_io_amount = 0 + elif op_name in ["host2device"]: + read_io_amount = 0 + write_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + elif op_name in ["reduce_sum", "reduce_max", "reduce_min"]: + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + write_io_amount = dtype_size * sum([math.prod(shape[:-1]) for shape in input_shapes]) + elif op_name in ["unique", "sort"]: + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + write_io_amount = 2 * dtype_size * sum([math.prod(shape) for shape in input_shapes]) + elif op_name in ["scatter", "gather"]: + tensor_shape = input_shapes[0] + read_io_amount = (dtype_size + get_dtype_bytes("int64")) *
math.prod(tensor_shape) + write_io_amount = dtype_size * math.prod(tensor_shape) + elif op_name == "cast": + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + # dtype is a string, so compare with "float32" rather than torch.float32 + write_io_amount = read_io_amount / 2 if dtype == "float32" else read_io_amount * 2 + elif op_name in ["index_add"]: + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + get_dtype_bytes("int32") * input_shapes[1][0] + write_io_amount = dtype_size * math.prod(input_shapes[0]) + else: + read_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + write_io_amount = dtype_size * sum([math.prod(shape) for shape in input_shapes]) + + total_io_amount = read_io_amount + write_io_amount + + return batch_size, total_io_amount, read_io_amount, write_io_amount + + +def dump_communication_ops_report( + op_name: str, + dtype: str, + input_shapes: List[List[int]], + group_size: int, + bandwidth_limit: float, + latency: float, + error: str = "" +): + size = math.prod(input_shapes[0]) + dtype_size = get_dtype_bytes(dtype) + mb = dtype_size * size / 1024 / 1024 + if error == "": + algo_bw = dtype_size * size / latency / 1e3 + + """ + allreduce: 2 * (group_size - 1) * (tensor_size / group_size) + allgather: 1 * (group_size - 1) * (tensor_size / group_size) + reducescatter: 1 * (group_size - 1) * (tensor_size / group_size) + alltoall: 1 * (group_size - 1) * (tensor_size / group_size) + broadcast: tensor_size + p2p: tensor_size + """ + bus_bw = algo_bw * (group_size - 1) / group_size + if op_name in ["broadcast", "p2p"]: + bus_bw = algo_bw + if op_name == "allreduce": + bus_bw *= 2 + + bandwidth_utils = None + if bandwidth_limit is not None: + bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2) + report = { + "Dtype": str(dtype), + "Tensor Shapes": input_shapes, + "Memory Size(MB)": round(mb, 2), + "Group": group_size, + "Kernel bandwidth(GB/s)": round(algo_bw, 2), + "Bus bandwidth(GB/s)": round(bus_bw, 2), + "Bandwidth Utilization(%)": bandwidth_utils, + "Avg latency(us)": round(latency, 2), + } + else: + report = { + "Dtype": str(dtype), + "Tensor Shapes": input_shapes, + "Memory Size(MB)": round(mb, 2), + "Group": group_size, + "Kernel bandwidth(GB/s)": 0, + "Bus bandwidth(GB/s)": 0, + "Bandwidth Utilization(%)": None, + "Avg latency(us)": 0, + "Error": error, + } + return report + + +def dump_computation_ops_report( + op_name: str, + dtype: str, + input_shapes: List[List[int]], + bandwidth_limit: float, + latency: float, + error: str = "" +): + batch_size, total_io_amount, read_io_amount, write_io_amount = get_io_amount(op_name, input_shapes, dtype) + + if error == "": + qps = round(1000 / latency * batch_size, 2) + algo_bw = total_io_amount / latency / 1e3 + + bandwidth_utils = None + if bandwidth_limit is not None: + bandwidth_utils = round((algo_bw / bandwidth_limit) * 1e2, 2) + report = { + "Dtype": str(dtype), + "Tensor Shapes": input_shapes, + "Read IO Size(MB)": round(read_io_amount / 1024 / 1024, 2), + "Write IO Size(MB)": round(write_io_amount / 1024 / 1024, 2), + "Memory Size(MB)": round(total_io_amount / 1024 / 1024, 2), + "Kernel bandwidth(GB/s)": round(algo_bw, 2), + "Bandwidth Utilization(%)": bandwidth_utils, + "Avg latency(us)": round(latency, 2), + "QPS": qps, + } + else: + report = { + "Dtype": str(dtype), + "Tensor Shapes": input_shapes, + "Read IO Size(MB)": round(read_io_amount / 1024 / 1024, 2), + "Write IO Size(MB)": round(write_io_amount / 1024 / 1024, 2), + "Memory Size(MB)": round(total_io_amount / 1024 / 1024, 2), +
"Kernel bandwidth(GB/s)": 0, + "Bandwidth Utilization(%)": None, + "Avg latency(us)": 0, + "QPS": 0, + "Error": error, + } + return report diff --git a/toolbox/ByteMLPerf/byte_micro_perf/compiled_cache.db b/toolbox/ByteMLPerf/byte_micro_perf/compiled_cache.db new file mode 100644 index 0000000000000000000000000000000000000000..1894846cceaf75183b5eb2e4bb7f753badc9056f Binary files /dev/null and b/toolbox/ByteMLPerf/byte_micro_perf/compiled_cache.db differ diff --git a/toolbox/ByteMLPerf/byte_micro_perf/core/perf_engine.py b/toolbox/ByteMLPerf/byte_micro_perf/core/perf_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..e2774c8218943ccc794e5368b0d6d4d4d2f3afa6 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/core/perf_engine.py @@ -0,0 +1,398 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import importlib +import json +import logging +import math +import os +import subprocess +import sys +import pathlib +import traceback +import random +from typing import Any, Dict, List +import itertools + + +import torch.multiprocessing as mp +import virtualenv + +BYTE_MLPERF_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +from backends.backend import Backend + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("PerfEngine") + + +def get_args(): + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + default="gemm", + help="The task going to be evaluted, refs to workloads/", + ) + parser.add_argument( + "--hardware_type", + default="GPU", + help="The backend going to be evaluted, refs to backends/", + ) + parser.add_argument( + "--vendor_path", + required=False, + help="The hardware configs need to be loaded, refs to vendor_zoo/NVIDIA/A100-PCIe.json", + ) + parser.add_argument( + "--compile_only", action="store_true", help="Run compilation only" + ) + + args = parser.parse_args() + return args + + +def load_workload(task: str) -> Dict[str, Any]: + """ + Return a list of dictionary with model Configuration + Args: List[str] + Returns: List[dic] + """ + modules_dir = ( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + "/workloads" + ) + + for file in os.listdir(modules_dir): + path = os.path.join(modules_dir, file) + if ( + not file.startswith("_") + and not file.startswith(".") + and (file.endswith(".json") or os.path.isdir(path)) + and file[: file.find(".json")] == task + ): + module_name = file + with open("workloads/" + module_name, "r") as f: + workload_dict = json.load(f) + return workload_dict + else: + log.error( + "Task name: [ {} ] was not found, please check your task name".format(task) + ) + +def parse_workload(workload): + shape_list = [] + if "input_shape_list" in workload: + shape_list.extend(workload["input_shape_list"]) + # gemm or batch_gemm + elif "M/K/N" in workload: + if "batch_size" in workload: + for batch_size in workload["batch_size"]: + for M, K, N 
in workload["M/K/N"]: + shape_list.append([ + [batch_size, M, K], + [batch_size, K, N] + ]) + else: + for M, K, N in workload["M/K/N"]: + shape_list.append([[M, K], [K, N]]) + # group_gemm + elif "MKN_choices" in workload: + seed = workload["seed"] + MKN_list = workload["MKN_choices"] + problems_list = workload["problems"] + + random.seed(seed) + for problems in problems_list: + cur_inputs = [] + for _ in range(problems): + M, K, N = [random.choice(MKN_list) for _ in range(3)] + cur_shapes = [[M, K], [K, N]] + cur_inputs.append(cur_shapes) + shape_list.append(cur_inputs) + + + if "input_shape_groups" in workload: + input_shape_groups = workload["input_shape_groups"] if isinstance(workload["input_shape_groups"], list) else [workload["input_shape_groups"]] + + for input_shape_group in input_shape_groups: + if "inputs" in input_shape_group: + input_shape_list = [] + for input_shapes in input_shape_group["inputs"]: + input_shape_list.append([list(shape) for shape in itertools.product(*input_shapes)]) + if len(input_shape_list) == 1: + shape_list.extend(input_shape_list[0]) + else: + shape_list.extend([list(input_shape) for input_shape in zip(*input_shape_list)]) + + else: + gemm_keys = ["M", "K", "N", "MN", "MK", "KN"] + gemm_values = [input_shape_group.get(k, []) for k in gemm_keys] + if any(gemm_values): + m ,k, n, mn, mk, kn = gemm_values + # batch gemm + if "batch_size" in input_shape_group: + bs = input_shape_group.get("batch_size", []) + if m and n and k: + for p in itertools.product(bs, m, k, n): + shape_list.append([[p[0], p[1], p[2]], [p[0], p[2], p[3]]]) + if mn and k: + for p in itertools.product(bs, mn, k): + shape_list.append([[p[0], p[1][0], p[2]], [p[0], p[2], p[1][1]]]) + if mk and n: + for p in itertools.product(bs, mk, n): + shape_list.append([[p[0], p[1][0], p[1][1]], [p[0], p[1][1], p[2]]]) + if m and kn: + for p in itertools.product(bs, m, kn): + shape_list.append([[p[0], p[1], p[2][0]], [p[0], p[2][0], p[2][1]]]) + # group gemm + elif "gemm_group" in input_shape_group: + groups = input_shape_group.get("gemm_group", []) + kn = input_shape_group.get("KN", []) + if k and n: + kn.append([list(shape) for shape in itertools.product(k, n)]) + for group in groups: + for _kn in kn: + group_input_shape_list = [] + for m in group: + group_input_shape_list.append([[m, _kn[0]], [_kn[0], _kn[1]]]) + shape_list.append(group_input_shape_list) + # gemm + else: + if m and n and k: + for p in itertools.product(m, k, n): + shape_list.append([[p[0], p[1]], [p[1], p[2]]]) + if mn and k: + for p in itertools.product(mn, k): + shape_list.append([[p[0][0], p[1]], [p[1], p[0][1]]]) + if mk and n: + for p in itertools.product(mk, n): + shape_list.append([[p[0][0], p[0][1]], [p[0][1], p[1]]]) + if m and kn: + for p in itertools.product(m, kn): + shape_list.append([[p[0], p[1][0]], [p[1][0], p[1][1]]]) + return shape_list + + +class PerfEngine: + def __init__(self) -> None: + super().__init__() + self.args = get_args() + self.workload = load_workload(self.args.task) + self.backend_type = self.args.hardware_type + self.old_os_path = os.environ["PATH"] + self.prev_sys_path = list(sys.path) + self.real_prefix = sys.prefix + + def init_process(self, rank: int, world_size: int): + """ + Initialize the distributed environment. 
+ + """ + initialize_func = getattr(self.backend, "initialize_ccl") + + # world_size may excced available device count + ret = initialize_func(rank, world_size) + if ret is not None and not ret: + return + + status = self.start_perf(self.workload) + return status + + def init_backend(self, hardware_type: str) -> Backend: + """ + Load related compile backend with input hardware type + + Arguments: str + + Returns: Heterogeneous Backend() + """ + log.info("Loading Heterogeneous Backend: {}".format(hardware_type)) + + backend = importlib.import_module( + "backends." + hardware_type + ".backend_" + hardware_type.lower() + ) + backend = getattr(backend, "Backend" + hardware_type) + return backend(self.workload, self.args.vendor_path) + + def start_engine(self) -> None: + #status = self.activate_venv(self.backend_type) + #if not status: + # log.warning("Activate virtualenv Failed, Please Check...") + + self.backend = self.init_backend(self.backend_type) + output_dir = os.path.abspath("reports/" + self.backend_type) + os.makedirs(output_dir, exist_ok=True) + + if self.args.task in ["allreduce", "allgather", "reducescatter", "alltoall", "broadcast", "p2p"]: + for group in self.workload["group"]: + try: + mp.spawn(fn=self.init_process, args=(group,), nprocs=group) + except Exception as e: + traceback.print_exc() + log.error(f"Execute task: {self.args.task} failed, group: {group}, error msg: {e}") + else: + status = self.start_perf(self.workload) + + self.deactivate_venv() + + def start_perf(self, workload: Dict[str, Any]) -> bool: + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + if local_rank == 0: + log.info( + "******************************************* Start to test op: [{}]. *******************************************".format( + workload["operator"] + ) + ) + + # Initalize Output Dir and Reports + output_dir = pathlib.Path("reports").joinpath(self.backend_type).joinpath(workload["operator"]) + os.makedirs(output_dir, exist_ok=True) + + op_name = workload["operator"] + base_report = { + "Operator": op_name.upper(), + "Backend": self.backend_type, + "Host Info": self.get_cpu_name(), + "Device Info": getattr(self.backend, "get_device_name")(), + } + + op = getattr(self.backend, op_name.lower(), None) + if op is not None and callable(op): + op() + else: + raise ValueError(f"Unknown operation: {op_name.lower()}") + + # get input shape info + shape_list = parse_workload(self.workload) + + # dtype list + dtype_list = self.workload["dtype"] + + for dtype in dtype_list: + perf_reports = [] + base_report["Performance"] = {} + + for input_shape in shape_list: + """ + input_shape could be: + List[int]: single shape. cos + List[List[int]]: multiple inputs. add + List[List[List[in]]]: multiple inputs with multiple problems. 
group_gemm + """ + if local_rank == 0: + log.info(f"Execute op: [{op_name.lower()}], input_shape: {input_shape}, dtype: {dtype}") + if isinstance(input_shape[0], int): + input_shape = [input_shape] + try: + reports = self.backend.perf(input_shape, dtype) + except Exception as e: + traceback.print_exc() + log.error(f"Execute op: {op_name.lower()} failed, input_shape: {input_shape}, dtype: {dtype}, error msg: {e}") + reports = {} + perf_reports.append(reports) + base_report["Performance"] = perf_reports + + # write output to json file + has_group = "Group" in base_report["Performance"][0] + output_report_path = ( + f"result-{str(dtype)}" + + ( + f"-group{base_report['Performance'][0]['Group']}" + if has_group + else "" + ) + + ".json" + ) + output_report_path = os.path.join(output_dir, output_report_path) + if local_rank == 0: + # logging.info(base_report["Performance"]) + with open(output_report_path, "w") as file: + json.dump(base_report, file, indent=4) + if local_rank == 0: + log.info( + "******************************************* Test op: [{}] SUCCESS. *******************************************".format( + workload["operator"] + ) + ) + return True + + def get_cpu_name(self): + command = "lscpu | grep 'Model name' | awk -F: '{print $2}'" + cpu_name = subprocess.check_output(command, shell=True) + return cpu_name.decode().strip() + + def activate_venv(self, hardware_type: str) -> bool: + if os.path.exists("backends/" + hardware_type + "/requirements.txt"): + log.info("Activating Virtual Env for " + hardware_type) + + venv_dir = os.path.join("backends", hardware_type + "/venv") + activate_file = os.path.join(venv_dir, "bin", "activate_this.py") + if not os.path.exists(venv_dir): + log.info("venv not exist, Creating Virtual Env for " + hardware_type) + + virtualenv.create_environment(venv_dir, True) + + exec(open(activate_file).read(), {"__file__": activate_file}) + python_path = os.path.join(venv_dir, "bin", "python3") + subprocess.call( + [python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"] + ) + subprocess.call( + [ + python_path, + "-m", + "pip", + "install", + "-r", + "backends/" + hardware_type + "/requirements.txt", + "-q", + ] + ) + else: + exec(open(activate_file).read(), {"__file__": activate_file}) + """ + just in case install failed in pre-run. + """ + python_path = os.path.join(venv_dir, "bin", "python3") + subprocess.call( + [python_path, "-m", "pip", "install", "--upgrade", "pip", "--quiet"] + ) + subprocess.call( + [ + python_path, + "-m", + "pip", + "install", + "-r", + "backends/" + hardware_type + "/requirements.txt", + "-q", + ] + ) + + if not hasattr(sys, "real_prefix"): + return False + return True + return True + + def deactivate_venv(self): + sys.path[:0] = self.prev_sys_path #will also revert the added site-packages + sys.prefix = self.real_prefix + os.environ["PATH"] = self.old_os_path + + +if __name__ == "__main__": + engine = PerfEngine() + engine.start_engine() diff --git a/toolbox/ByteMLPerf/byte_micro_perf/launch.py b/toolbox/ByteMLPerf/byte_micro_perf/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..f31829180868a3f884d3433d84948680591761d8 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/launch.py @@ -0,0 +1,108 @@ +# Copyright 2023 ByteDance and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import logging +import os +import random +import socket +import subprocess +import sys + +BYTE_MLPERF_ROOT = os.path.dirname(os.path.abspath(__file__)) +os.chdir(BYTE_MLPERF_ROOT) +sys.path.insert(0, BYTE_MLPERF_ROOT) + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("lanuch") + + +def parse_task(task_dir): + tasks = [] + if os.path.isdir(task_dir): + for root, _, files in os.walk(task_dir, topdown=False): + for name in files: + if name.endswith(".json"): + tasks.append(name.rsplit('.', 1)[0]) + return tasks + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", default="", help="The task going to be evaluted, refs to workloads/" + ) + parser.add_argument( + "--task_dir", default="", help="The direcotry of tasks going to be evaluted, e.g., set to workloads" + ) + parser.add_argument( + "--hardware_type", + default="GPU", + help="The backend going to be evaluted, refs to backends/", + ) + parser.add_argument( + "--vendor_path", + required=False, + help="The hardware configs need to be loaded, refs to vendor_zoo/NVIDIA/A100-PCIe.json", + ) + parser.add_argument( + "--compile_only", + action="store_true", + help="Task will stoped after compilation finished", + ) + parser.add_argument( + "--show_task_list", action="store_true", help="Print all task names" + ) + parser.add_argument( + "--show_hardware_list", + action="store_true", + help="Print all hardware bytemlperf supported", + ) + args = parser.parse_args() + + if args.show_task_list: + logging.info("******************* Supported Task *******************") + for file in os.listdir("workloads"): + print(file[:-5]) + if args.show_hardware_list: + log.info("***************** Supported Hardware Backend *****************") + for file in os.listdir("backends"): + if not file.endswith(".py") and not file.startswith("_"): + print(file) + if args.task or args.task_dir: + log.info("******************* Pip Package Installing *******************") + subprocess.call( + ["python3", "-m", "pip", "install", "pip", "--upgrade", "--quiet"] + ) + + subprocess.call( + ["python3", "-m", "pip", "install", "-r", "requirements.txt", "--quiet"] + ) + + if args.task: + if args.task_dir: + log.warning("task and task_dir are both set, task_dir will be ignored") + tasks = args.task.split(',') + elif args.task_dir: + tasks = parse_task(args.task_dir) + logging.info(f"******************* Tasks: {tasks}") + exit_code = 0 + for task in tasks: + cmd = "python3 core/perf_engine.py --hardware_type {} --task {} --vendor_path {}".format( + args.hardware_type, task, args.vendor_path + ) + exit_code = subprocess.call(cmd, shell=True) + + sys.exit(exit_code) diff --git a/toolbox/ByteMLPerf/byte_micro_perf/requirements.txt b/toolbox/ByteMLPerf/byte_micro_perf/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9adbfddb211eb4cbfa11958041c824719565a885 --- /dev/null +++ b/toolbox/ByteMLPerf/byte_micro_perf/requirements.txt @@ -0,0 +1,14 @@ +matplotlib +pandas +virtualenv==16.7.12 +scikit-learn +prompt_toolkit +tqdm +opencv-python +transformers 
+tokenization +fpdf +attrs +decorator +typing-extensions +pydot \ No newline at end of file diff --git a/toolbox/ByteMLPerf/docs/images/flowchat.png b/toolbox/ByteMLPerf/docs/images/flowchat.png new file mode 100644 index 0000000000000000000000000000000000000000..6a32f6eb7693027f4039e9d36ab58278de6117f2 Binary files /dev/null and b/toolbox/ByteMLPerf/docs/images/flowchat.png differ diff --git a/toolbox/ByteMLPerf/docs/images/icon.png b/toolbox/ByteMLPerf/docs/images/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..2e4654a4df64458f3358c42e5545bff0ebf8b51f Binary files /dev/null and b/toolbox/ByteMLPerf/docs/images/icon.png differ diff --git a/toolbox/ByteMLPerf/pylint.conf b/toolbox/ByteMLPerf/pylint.conf new file mode 100644 index 0000000000000000000000000000000000000000..c6398108cf4df3ac866bf762e677f2f6afc0cc2b --- /dev/null +++ b/toolbox/ByteMLPerf/pylint.conf @@ -0,0 +1,630 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. 
The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=no + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.7 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. 
Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. 
+valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=12 + +# Maximum number of locals for function / method body. +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=builtins.BaseException,builtins.Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=100 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow explicit reexports by alias from a package __init__. +allow-reexport-from-package=no + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. 
+logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. 
+score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work.. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. 
+missing-member-hint-distance=1

# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1

# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin

# List of decorators that change the signature of a decorated function.
signature-mutators=


[VARIABLES]

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=

# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes

# List of names allowed to shadow builtins
allowed-redefined-builtins=

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
          _cb

# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_

# Argument names that match this expression will be ignored.
ignored-argument-names=_.*|^ignored_|^unused_

# Tells whether we should check for unused import in __init__ files.
init-import=no

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
diff --git a/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_16.png b/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_16.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6489b27a9a842564b982b9f9b19d8105a3bdc8a
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_16.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_17.png b/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_17.png
new file mode 100644
index 0000000000000000000000000000000000000000..971af92d31837d6da5df3d53d7f00e287fea97a8
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/AWS/mem_17.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Graphcore/image_12.png b/toolbox/ByteMLPerf/vendor_zoo/Graphcore/image_12.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c2abda9a4b4043e7cd809aba2eac4075c56f970
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Graphcore/image_12.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Graphcore/mem_12.png b/toolbox/ByteMLPerf/vendor_zoo/Graphcore/mem_12.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfd3e79f2e83f85be0c23945546e7171633e2704
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Graphcore/mem_12.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Habana/image_14.png b/toolbox/ByteMLPerf/vendor_zoo/Habana/image_14.png
new file mode 100644
index 0000000000000000000000000000000000000000..6db434318a3bcaf11ecdf0ff89502878212057f8
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Habana/image_14.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Habana/mem_14.png b/toolbox/ByteMLPerf/vendor_zoo/Habana/mem_14.png
new file mode 100644
index 0000000000000000000000000000000000000000..adce2a466e1e1c0c36ee0c8b3b6d802c14ffa55a
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Habana/mem_14.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Habana/pe_14.png b/toolbox/ByteMLPerf/vendor_zoo/Habana/pe_14.png
new file mode 100644
index 0000000000000000000000000000000000000000..adce2a466e1e1c0c36ee0c8b3b6d802c14ffa55a
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Habana/pe_14.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_7.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_7.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e0891587d699378e4e08ced41cdfe67878207fb
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_7.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_8.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_8.png
new file mode 100644
index 0000000000000000000000000000000000000000..3111cd349fe65dfadfb656b8305c9e9fa382358b
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_8.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_9.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_9.png
new file mode 100644
index 0000000000000000000000000000000000000000..93335f33413656497a71491067e747c375767703
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/image_9.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_7.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_7.png
new file mode 100644
index 0000000000000000000000000000000000000000..db3996b73c8b635269e3c64d0ce7bb83d996c716
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_7.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_8.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_8.png
new file mode 100644
index 0000000000000000000000000000000000000000..db3996b73c8b635269e3c64d0ce7bb83d996c716
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_8.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_9.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_9.png
new file mode 100644
index 0000000000000000000000000000000000000000..db3996b73c8b635269e3c64d0ce7bb83d996c716
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/mem_9.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_7.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_7.png
new file mode 100644
index 0000000000000000000000000000000000000000..db0043861da069590e7eaf1f96386f0ec7a4366c
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_7.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_9.png b/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_9.png
new file mode 100644
index 0000000000000000000000000000000000000000..db0043861da069590e7eaf1f96386f0ec7a4366c
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Moffett/pe_9.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_0.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..1fd0af5316b7dc05521a5bd9ba425e11abce0635
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_0.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_1.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..b53324ccb4f07b2ae651aa3b528f13ed94ef1416
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_1.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_2.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..5af4f434dbb1899fbaf4ca30c6a423f7e28f26e8
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_2.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_3.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d258b00dcbcad1f4a8692f126469360036d0da06
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_3.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_4.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa897b4433b9b32333503f1b0ec2f5c445a65dd5
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_4.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_5.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_5.png
new file mode 100644
index 0000000000000000000000000000000000000000..09e64d5ab3614d367598d624fad76be1a60c1270
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_5.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_6.png b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_6.png
new file mode 100644
index 0000000000000000000000000000000000000000..09e64d5ab3614d367598d624fad76be1a60c1270
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/NVIDIA/image_6.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/image_10.png b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/image_10.png
new file mode 100644
index 0000000000000000000000000000000000000000..7a191a58b0ae6baf74206236c00bec8480d25d28
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/image_10.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/mem_10.png b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/mem_10.png
new file mode 100644
index 0000000000000000000000000000000000000000..a09e8defbca86952cc607ace303ae0178f3d421d
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/mem_10.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/pe_10.png b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/pe_10.png
new file mode 100644
index 0000000000000000000000000000000000000000..1c6a50d3079988f4e5cfa9f960581db5995dba3f
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/QUALCOMM/pe_10.png differ
diff --git a/toolbox/ByteMLPerf/vendor_zoo/Stream/image_13.png b/toolbox/ByteMLPerf/vendor_zoo/Stream/image_13.png
new file mode 100644
index 0000000000000000000000000000000000000000..63e4378c1fde4b7ef1a585fba9d8ee6883b03e9c
Binary files /dev/null and b/toolbox/ByteMLPerf/vendor_zoo/Stream/image_13.png differ
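For reference, two small sanity checks on the pylint configuration added above. Both sketches are illustrative only and are not files in this repository.

The `evaluation` expression in the `[REPORTS]` section defines the global score printed when `score=yes`. The Python sketch below reproduces that exact expression with hypothetical message counts; the zero-statement guard is an addition for this sketch, not part of the configured expression.

```python
def pylint_score(fatal: int, error: int, warning: int,
                 refactor: int, convention: int, statement: int) -> float:
    """Reproduce the `evaluation` expression from the [REPORTS] section."""
    if statement == 0:  # guard added for this sketch only
        return 10.0
    return max(0, 0 if fatal else
               10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))


# Hypothetical run: 2 errors, 5 warnings, 3 refactor and 4 convention messages
# over 400 analysed statements -> 10.0 - (22 / 400) * 10 = 9.45
print(pylint_score(fatal=0, error=2, warning=5, refactor=3, convention=4, statement=400))
```

The `timeout-methods` list in `[METHOD_ARGS]` drives pylint's missing-timeout check. Assuming a pylint version that implements that check, the first call below would be reported while the second passes; `example_timeouts.py` is a hypothetical file used only for illustration.

```python
# example_timeouts.py -- hypothetical illustration of the [METHOD_ARGS] setting.
import requests


def fetch(url: str) -> bytes:
    bad = requests.get(url)               # reported: no timeout passed
    good = requests.get(url, timeout=10)  # accepted: explicit timeout
    return bad.content + good.content
```

To exercise the whole configuration, run `pylint --rcfile=<path to this config> <package>` and compare the printed score with the expression above.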