diff --git a/models/cv/face_recognition/facenet/ixrt/README.md b/models/cv/face_recognition/facenet/ixrt/README.md index 62d4de802ca9a3014cf11ffe66e8ad1b6bae1a3a..cd58c9b9e3ceeb57d077729b7afc781093fb442c 100644 --- a/models/cv/face_recognition/facenet/ixrt/README.md +++ b/models/cv/face_recognition/facenet/ixrt/README.md @@ -18,12 +18,6 @@ Pretrained model: to download the lfw dataset. -```bash -cd ${DeepSparkInference_PATH}/models/cv/face/facenet/ixrt -# download and unzip 20180408-102900.zip -unzip 20180408-102900.zip -``` - ### Install Dependencies ```bash @@ -41,14 +35,8 @@ pip3 install -r requirements.txt ```bash mkdir -p checkpoints mkdir -p facenet_weights -git clone https://github.com/timesler/facenet-pytorch -# facenet-pytorch/dependencies/facenet is submodule, pls make sure it has been cloned or you can clone directly from https://github.com/davidsandberg/facenet/tree/096ed770f163957c1e56efa7feeb194773920f6e -mv /Path/facenet/ixrt/tensorflow2pytorch.py facenet-pytorch -python3 ./facenet-pytorch/tensorflow2pytorch.py \ - --facenet_weights_path ./facenet_weights \ - --facenet_pb_path ./20180408-102900 \ - --onnx_save_name facenet_export.onnx -mv facenet_export.onnx ./facenet_weights +cd facenet_weights +wget http://files.deepspark.org.cn:880/deepspark/facenet_export.onnx ``` ### Data preprocessing diff --git a/models/cv/face_recognition/facenet/ixrt/build_engine.py b/models/cv/face_recognition/facenet/ixrt/build_engine.py index 74a62202defa50397cc4227da2181eebe10ab3e9..057587f81ec202e4b1077e8dc7dd2a0fdc7bfa9e 100644 --- a/models/cv/face_recognition/facenet/ixrt/build_engine.py +++ b/models/cv/face_recognition/facenet/ixrt/build_engine.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
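Note on the README change above: the TensorFlow-to-PyTorch conversion flow is replaced by a download of a pre-exported `facenet_export.onnx`. A minimal sanity check of the downloaded file, assuming the `onnx` package from `requirements.txt`; the expected 64x3x160x160 input follows the export settings of the removed converter script:

```python
# Hypothetical check, not part of the repo: confirm the downloaded ONNX
# graph is well formed and print its declared input shape.
import onnx

model = onnx.load("facenet_weights/facenet_export.onnx")
onnx.checker.check_model(model)  # raises if the graph is malformed

for inp in model.graph.input:
    dims = [d.dim_value or d.dim_param for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)  # expected: input [64, 3, 160, 160]
```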
- import os import cv2 import argparse @@ -28,13 +13,13 @@ from load_ixrt_plugin import load_ixrt_plugin load_ixrt_plugin() def add_facenet_norm(onnx_model): - norm = helper.make_node('FacenetNorm_IxRT', inputs=['/last_bn/BatchNormalization_output_0'] , outputs=['/Pow_1_output_0'], name='facenet_norm_1', size=512) + norm = helper.make_node('FacenetNorm_IxRT', inputs=['1189'] , outputs=['1190'], name='facenet_norm_1', size=512) onnx_model = onnx.load(onnx_model) graph = onnx_model.graph nodes = graph.node graph.node.append(norm) - output = onnx.helper.make_tensor_value_info('/Pow_1_output_0', TensorProto.FLOAT, [64, 512, 1, 1]) + output = onnx.helper.make_tensor_value_info('1190', TensorProto.FLOAT, [64, 512, 1, 1]) graph = onnx.helper.make_graph( graph.node, "facenet model", diff --git a/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh b/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh index 9d7081e7f053f6117e91bd5df10c9ce25e61c04d..44ffa453bd2d83d1b56ce4274c62dc4c3d19b9fb 100644 --- a/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh +++ b/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh @@ -26,16 +26,6 @@ else fi pip3 install -r requirements.txt -unzip -q /root/data/checkpoints/20180408-102900.zip -d ./ unzip -q /root/data/datasets/facenet_datasets.zip -d ./ -mkdir -p checkpoints mkdir -p facenet_weights -cp -r /root/data/3rd_party/facenet-pytorch ./ -cp ./tensorflow2pytorch.py facenet-pytorch -python3 ./facenet-pytorch/tensorflow2pytorch.py \ - --facenet_weights_path ./facenet_weights \ - --facenet_pb_path ./20180408-102900 \ - --onnx_save_name facenet_export.onnx -mv facenet_export.onnx ./facenet_weights - -sed -i -e 's#/last_bn/BatchNormalization_output_0#1187#g' -e 's#/avgpool_1a/GlobalAveragePool_output_0#1178#g' deploy.py build_engine.py \ No newline at end of file +cp /root/data/checkpoints/facenet_export.onnx ./facenet_weights diff --git a/models/cv/face_recognition/facenet/ixrt/common.py b/models/cv/face_recognition/facenet/ixrt/common.py index 9db1327ad1531c452fb38182d747c81fc6f8eccf..4b9ae1140ebb347f2127c141ec0e5934ef02f59d 100644 --- a/models/cv/face_recognition/facenet/ixrt/common.py +++ b/models/cv/face_recognition/facenet/ixrt/common.py @@ -1,25 +1,10 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
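Context for the renames above: the pre-exported graph uses numeric value names, so the custom norm node now reads `1189` and writes `1190` instead of the symbolic `/last_bn/...` names produced by the old local export. A compact sketch of the pattern `add_facenet_norm` applies, with illustrative file paths; the real function rebuilds the graph via `onnx.helper.make_graph` rather than editing it in place:

```python
# Append the IxRT FacenetNorm plugin node and re-declare the graph output.
import onnx
from onnx import TensorProto, helper

model = onnx.load("facenet_weights/facenet_export_fused.onnx")  # illustrative path
norm = helper.make_node("FacenetNorm_IxRT", inputs=["1189"], outputs=["1190"],
                        name="facenet_norm_1", size=512)  # size becomes an attribute
model.graph.node.append(norm)

# The plugin output replaces the old graph output declaration.
out = helper.make_tensor_value_info("1190", TensorProto.FLOAT, [64, 512, 1, 1])
del model.graph.output[:]
model.graph.output.append(out)
onnx.save(model, "facenet_weights/facenet_with_norm.onnx")
```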
- import os import cv2 import glob import torch import tensorrt import numpy as np -import pycuda.driver as cuda +from cuda import cuda, cudart from torch.utils.data import DataLoader, SubsetRandomSampler, SequentialSampler from torchvision import datasets, transforms @@ -53,13 +38,15 @@ def get_io_bindings(engine): size = np.dtype(tensorrt.nptype(dtype)).itemsize for s in shape: size *= s - allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess binding = { "index": i, "name": name, "dtype": np.dtype(tensorrt.nptype(dtype)), "shape": list(shape), "allocation": allocation, + "nbytes": size, } print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") allocations.append(allocation) diff --git a/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG b/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG index 3b3282eff772fa4a2d46d2cc2aace1570ad0f1bb..74ea45fe130915d2c2775a58bbf514fec8c54bdd 100644 --- a/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG +++ b/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - # IMGSIZE : 模型输入hw大小 # MODEL_NAME : 生成onnx/engine的basename # ORIGINE_MODEL : 原始onnx文件名称 diff --git a/models/cv/face_recognition/facenet/ixrt/deploy.py b/models/cv/face_recognition/facenet/ixrt/deploy.py index 79f4ce5880bb50f78127a923e09c446547ac3fd2..3036363f22df57f86dab738bd827ee993bc5a424 100644 --- a/models/cv/face_recognition/facenet/ixrt/deploy.py +++ b/models/cv/face_recognition/facenet/ixrt/deploy.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
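The `common.py` hunk above is part of a migration from pycuda to the cuda-python bindings: `cuda.mem_alloc` becomes `cudart.cudaMalloc`, every call returns an error code that must be unpacked and checked, and the buffer size is now kept in the binding as `nbytes` for later copies. A minimal round trip mirroring the calls the patch introduces (a standalone sketch, assuming a CUDA device; the runtime-API allocation initializes the context that the driver-API copies rely on):

```python
import numpy as np
from cuda import cuda, cudart

host = np.arange(16, dtype=np.float32)

# Allocate device memory; cuda-python returns (err, ptr) tuples.
err, dev_ptr = cudart.cudaMalloc(host.nbytes)
assert err == cudart.cudaError_t.cudaSuccess

# Host -> device, then back again, using the driver-API copies from the patch.
err, = cuda.cuMemcpyHtoD(dev_ptr, host, host.nbytes)
assert err == cuda.CUresult.CUDA_SUCCESS
out = np.empty_like(host)
err, = cuda.cuMemcpyDtoH(out, dev_ptr, out.nbytes)
assert err == cuda.CUresult.CUDA_SUCCESS

err, = cudart.cudaFree(dev_ptr)
assert err == cudart.cudaError_t.cudaSuccess
assert np.array_equal(host, out)
```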
- import onnx import os import simplejson as json @@ -34,7 +19,7 @@ def onnx_sim(onnx_name, save_name): def cut_model(onnx_name): input_names = ["input"] - output_names = ["/last_bn/BatchNormalization_output_0"] + output_names = ["1189"] onnx.utils.extract_model(onnx_name, onnx_name, input_names, output_names) def fuse_matmul(onnx_name, save_onnx_name): @@ -94,10 +79,10 @@ def fuse_matmul(onnx_name, save_onnx_name): graph.initializer.append(conv_bias_new_initializer) pre_node.op_type = "Conv" - pre_node.input[0] = "/avgpool_1a/GlobalAveragePool_output_0" + pre_node.input[0] = "1180" pre_node.input[1] = "conv_weights_new" pre_node.input.append("conv_bias_new") - pre_node.output[0] = "/last_bn/BatchNormalization_output_0" + pre_node.output[0] = "1189" dilations = onnx.helper.make_attribute("dilations", [1,1]) group = onnx.helper.make_attribute("group", 1) kernel_shape = onnx.helper.make_attribute("kernel_shape", [1,1]) @@ -119,7 +104,7 @@ def fuse_matmul(onnx_name, save_onnx_name): graph.node.remove(node) if find_matmul==1: - output = onnx.helper.make_tensor_value_info('/last_bn/BatchNormalization_output_0', TensorProto.FLOAT, [64, 512, 1, 1]) + output = onnx.helper.make_tensor_value_info('1189', TensorProto.FLOAT, [64, 512, 1, 1]) graph = onnx.helper.make_graph( graph.node, "facenet model", @@ -389,10 +374,10 @@ def add_facenet_norm(cfg_name): graph_json["nodes"]["facenet_norm_1"] = { "inputs": [ - "/last_bn/BatchNormalization_output_0" + "1189" ], "outputs": [ - "/Pow_1_output_0" + "1190" ], "op_type": "FacenetNorm", "attrbiute": { @@ -400,7 +385,7 @@ def add_facenet_norm(cfg_name): } } graph_json["output"] = [] - graph_json["output"].append({"name":"/Pow_1_output_0", "type":"float32"}) + graph_json["output"].append({"name":"1190", "type":"float32"}) with open(cfg_name, "w") as fh: json.dump(graph_json, fh, indent=4) diff --git a/models/cv/face_recognition/facenet/ixrt/inference.py b/models/cv/face_recognition/facenet/ixrt/inference.py index eaed8b27ca70fb1628c8e5b3351b9e72692150fd..74a43f3e8da05fc5f1559b3a99046040f4cc8412 100644 --- a/models/cv/face_recognition/facenet/ixrt/inference.py +++ b/models/cv/face_recognition/facenet/ixrt/inference.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
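For reference on the `fuse_matmul` rewiring above: on an `[N, C, 1, 1]` activation, a fully connected layer with weight `W` of shape `[out, C]` is exactly a 1x1 `Conv` with `W` reshaped to `[out, C, 1, 1]`, which is why the node between `1180` and `1189` can be turned into a `Conv`. A numeric check of that identity (the 1792-to-512 sizes match FaceNet's bottleneck; the snippet itself is illustrative):

```python
import numpy as np

n, c_in, c_out = 4, 1792, 512
x = np.random.randn(n, c_in).astype(np.float32)
w = np.random.randn(c_out, c_in).astype(np.float32)

matmul_out = x @ w.T                           # MatMul/Gemm view
conv_w = w.reshape(c_out, c_in, 1, 1)          # Conv weight layout
conv_out = np.einsum("nchw,ochw->no",
                     x.reshape(n, c_in, 1, 1), conv_w)

assert np.allclose(matmul_out, conv_out, atol=1e-4)
```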
- #!/usr/bin/env python # -*- coding: utf-8 -*- @@ -25,8 +10,7 @@ from tqdm import tqdm import cv2 import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda +from cuda import cuda, cudart import torch import tensorrt from tensorrt.utils import topk @@ -58,7 +42,6 @@ def main(config): print("Warm Done.") # Inference - metricResult = {"metricResult": {}} if config.test_mode == "FPS": torch.cuda.synchronize() start_time = time.time() @@ -74,7 +57,6 @@ def main(config): print("FPS : ", fps) print(f"Performance Check : Test {fps} >= target {config.fps_target}") - metricResult["metricResult"]["FPS"] = round(fps, 3) if fps >= config.fps_target: print("pass!") exit() @@ -86,7 +68,7 @@ def main(config): classes = [] embeddings = [] - start_time = time.time() + for xb, yb in tqdm(embed_loader): output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) @@ -94,9 +76,11 @@ def main(config): xb = xb.numpy() xb = np.ascontiguousarray(xb) - cuda.memcpy_htod(inputs[0]["allocation"], xb) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], xb, xb.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) context.execute_v2(allocations) - cuda.memcpy_dtoh(output, outputs[0]["allocation"]) + err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"]) + assert(err == cuda.CUresult.CUDA_SUCCESS) output = output.reshape(output.shape[0],output.shape[1]) #print("output shape ",output.shape) @@ -104,8 +88,7 @@ def main(config): classes.extend(yb[0:current_imgs_num].numpy()) embeddings.extend(output) - e2e_time = time.time() - start_time - print(f"E2E time: {e2e_time:.3f} seconds") + embeddings_dict = dict(zip(crop_paths,embeddings)) pairs = read_pairs(config.datasets_dir + config.pairs_name) @@ -122,9 +105,6 @@ def main(config): #eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr, fill_value="extrapolate")(x), 0., 1.) #print('Equal Error Rate (EER): %1.3f' % eer) - metricResult["metricResult"]["E2E time"] = round(e2e_time, 3) - metricResult["metricResult"]["AUC"] = round(auc, 3) - metricResult["metricResult"]["Acc"] = round(np.mean(accuracy), 3) acc = np.mean(accuracy) print(f"Accuracy Check : Test {acc} >= target {config.acc_target}") if acc >= config.acc_target: @@ -133,7 +113,6 @@ def main(config): else: print("failed!") exit(1) - print(metricResult) def parse_config(): parser = argparse.ArgumentParser() @@ -157,7 +136,7 @@ def parse_config(): "--img", "--img-size", type=int, - default=160, + default=224, help="inference size h,w", ) parser.add_argument("--use_async", action="store_true") @@ -173,4 +152,4 @@ def parse_config(): if __name__ == "__main__": config = parse_config() - main(config) \ No newline at end of file + main(config) diff --git a/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py b/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py index ae47dc8e854b6bea1f768e65c4dd481048bfebce..932efbdfd1a4e91d8ddfd363adf6bce989df1709 100644 --- a/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py +++ b/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import ctypes import tensorrt from os.path import join, dirname, exists diff --git a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py b/models/cv/face_recognition/facenet/ixrt/modify_batchsize.py similarity index 58% rename from models/cv/object_detection/yolov7/ixrt/modify_batchsize.py rename to models/cv/face_recognition/facenet/ixrt/modify_batchsize.py index 3a88c1603bd6f457fd4965257627dc29edcda4d1..f329119d4b79feef2022c4b6a8d7a9ad4ea1f8f6 100644 --- a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py +++ b/models/cv/face_recognition/facenet/ixrt/modify_batchsize.py @@ -1,29 +1,14 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import onnx +import numpy as np import argparse -def change_input_dim(model, bsz): +def change_dim(model, bsz): batch_size = bsz # The following code changes the first dimension of every input to be batch_size # Modify as appropriate ... note that this requires all inputs to # have the same batch_size - inputs = model.graph.input - for input in inputs: + for input in model.graph.input: # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim. # Add checks as needed. dim1 = input.type.tensor_type.shape.dim[0] @@ -38,6 +23,8 @@ def change_input_dim(model, bsz): # set batch size of 1 dim1.dim_value = 1 + return model + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--batch_size", type=int) @@ -48,5 +35,5 @@ def parse_args(): args = parse_args() model = onnx.load(args.origin_model) -change_input_dim(model, args.batch_size) -onnx.save(model, args.output_model) \ No newline at end of file +change_dim(model, args.batch_size) +onnx.save(model, args.output_model) diff --git a/models/cv/face_recognition/facenet/ixrt/quant.py b/models/cv/face_recognition/facenet/ixrt/quant.py index 26413e3e0f58f219cce2bd78804de288cba1fd1a..e4bb3780c5788d184e63cba5a32c1f54b056c7a7 100644 --- a/models/cv/face_recognition/facenet/ixrt/quant.py +++ b/models/cv/face_recognition/facenet/ixrt/quant.py @@ -1,19 +1,5 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import os + import torch from tensorrt.deploy.api import * from tensorrt.deploy.utils.seed import manual_seed @@ -93,6 +79,14 @@ def quantize_model(args, model_name, model, dataloader): quant_format="qdq", disable_quant_names=None) +def add_1190_scale(cfg_name): + graph_json = json.load(open(cfg_name)) + + graph_json["quant_info"]["1190"] = graph_json["quant_info"]["1189"] + + with open(cfg_name, "w") as fh: + json.dump(graph_json, fh, indent=4) + def create_argparser(*args, **kwargs): parser = ArgumentParser(*args, **kwargs) parser.add_argument("--batch_size", type=int, default=64) @@ -128,6 +122,8 @@ def main(): else: print("[Error] file name not correct ", args.model) quantize_model(args, model_name, model, dataloader) + json_name = f"./facenet_weights/{model_name}.json" + add_1190_scale(json_name) if __name__ == "__main__": main() diff --git a/models/cv/face_recognition/facenet/ixrt/requirements.txt b/models/cv/face_recognition/facenet/ixrt/requirements.txt index b1b549a88d296c1f16d6eeb65bc28b9ddefcaea8..09895311f8ac47415b7774fbdcb78cb6c73f2f1a 100644 --- a/models/cv/face_recognition/facenet/ixrt/requirements.txt +++ b/models/cv/face_recognition/facenet/ixrt/requirements.txt @@ -1,12 +1,9 @@ -tensorflow -onnxsim -scikit-learn -tf_slim tqdm -pycuda -onnx tabulate scipy==1.8.0 -pycocotools -opencv-python==4.6.0.66 -simplejson \ No newline at end of file +scikit-learn +onnx +onnxsim +simplejson +numpy==1.23.5 +opencv-python==4.6.0.66 \ No newline at end of file diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh index 27e5e8ad859d95c86dfc9b29fdc78150b0c60c95..d1c797299665ca44546d2927e446b4daca769af3 100644 --- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh +++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
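The new `add_1190_scale` step above exists because calibration only records a scale for `1189`, the tensor the quantizer actually saw; the `FacenetNorm` output `1190` is grafted on after calibration, so it inherits its input's entry. A quick check of the written JSON, assuming the file name follows the script's `./facenet_weights/{model_name}.json` pattern:

```python
import json

# "facenet_export" is an assumed model_name; adjust to your quantized model.
with open("./facenet_weights/facenet_export.json") as fh:
    graph_json = json.load(fh)

# The grafted output should share its input's quantization scale.
assert graph_json["quant_info"]["1190"] == graph_json["quant_info"]["1189"]
```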
-
 #!/bin/bash

 EXIT_STATUS=0
@@ -43,6 +28,7 @@ do
     --tgt) TGT=${arguments[index]};;
     esac
 done
+
 PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
 echo PROJ_DIR : ${PROJ_DIR}
 RUN_DIR="${PROJ_DIR}/ixrt/"
@@ -102,13 +88,24 @@ if [ $PRECISION == "int8" ];then
     fi
 fi

+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+    python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+        --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+    echo "  "Generate ${FINAL_MODEL}
+fi

 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
 ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
-FINAL_MODEL=${SIM_MODEL}
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE already exists
 else
@@ -130,7 +127,7 @@ python3 ${RUN_DIR}/inference.py \
     --warm_up=${WARM_UP} \
     --loop_count ${LOOP_COUNT} \
     --test_mode ${RUN_MODE} \
-    --fps_target ${TGT} \
+    --acc_target ${TGT} \
     --bsz ${BSZ}; check_status
 exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
index 401658cafd85297b9d98f7febb9e7c88746062ef..5e0cf7808166803037683bb2aacf4003503556bb 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 #!/bin/bash

 EXIT_STATUS=0
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
index c2c2f176bcd0ea6bb00acedb6fbda80b47456a08..ea7cb3ecc6f675dc220fe9a2a81f94992c065cef 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
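The "Change Batchsize" step added above rewrites the leading input dimension of the simplified ONNX before the engine build, instead of building from `SIM_MODEL` directly. One way to confirm the result, assuming `MODEL_NAME=facenet_export` and `BSZ=64` (both depend on the config):

```python
import onnx

# Path follows the script's ${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx pattern.
model = onnx.load("checkpoints/facenet_export_64.onnx")
dim0 = model.graph.input[0].type.tensor_type.shape.dim[0]
print(dim0.dim_value)  # expected to equal the requested batch size, e.g. 64
```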
- #!/bin/bash EXIT_STATUS=0 diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh index 7574347c028dfdb28e3b06016d4c61fb6d3e1328..21c419d73c7a6f69531f195bb0dd8cd0e12484ef 100644 --- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh +++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - #!/bin/bash EXIT_STATUS=0 diff --git a/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py b/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py deleted file mode 100644 index f76ba0fff91ae1ac334c2babbc10f0d65139b711..0000000000000000000000000000000000000000 --- a/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import tensorflow.compat.v1 as tf -tf.disable_v2_behavior() -import torch -import json -import os, sys - -from dependencies.facenet.src import facenet -from dependencies.facenet.src.models import inception_resnet_v1 as tf_mdl -from dependencies.facenet.src.align import detect_face - -from models.inception_resnet_v1 import InceptionResnetV1 -from models.mtcnn import PNet, RNet, ONet - - -def import_tf_params(tf_mdl_dir, sess): - """Import tensorflow model from save directory. - - Arguments: - tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files. - sess {tensorflow.Session} -- Tensorflow session object. - - Returns: - (list, list, list) -- Tuple of lists containing the layer names, - parameter arrays as numpy ndarrays, parameter shapes. 
- """ - print('\nLoading tensorflow model\n') - if callable(tf_mdl_dir): - tf_mdl_dir(sess) - else: - facenet.load_model(tf_mdl_dir) - - print('\nGetting model weights\n') - images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0") - print(images_placeholder) - tf_layers = tf.trainable_variables() - tf_params = sess.run(tf_layers) - print(tf.get_default_graph()) - - tf_shapes = [p.shape for p in tf_params] - tf_layers = [l.name for l in tf_layers] - - print(tf_shapes) - print(tf_shapes) - - if not callable(tf_mdl_dir): - path = os.path.join(tf_mdl_dir, 'layer_description.json') - else: - path = 'data/layer_description.json' - with open(path, 'w') as f: - json.dump({l: s for l, s in zip(tf_layers, tf_shapes)}, f) - - return tf_layers, tf_params, tf_shapes - - -def get_layer_indices(layer_lookup, tf_layers): - """Giving a lookup of model layer attribute names and tensorflow variable names, - find matching parameters. - - Arguments: - layer_lookup {dict} -- Dictionary mapping pytorch attribute names to (partial) - tensorflow variable names. Expects dict of the form {'attr': ['tf_name', ...]} - where the '...'s are ignored. - tf_layers {list} -- List of tensorflow variable names. - - Returns: - list -- The input dictionary with the list of matching inds appended to each item. - """ - layer_inds = {} - for name, value in layer_lookup.items(): - layer_inds[name] = value + [[i for i, n in enumerate(tf_layers) if value[0] in n]] - return layer_inds - - -def load_tf_batchNorm(weights, layer): - """Load tensorflow weights into nn.BatchNorm object. - - Arguments: - weights {list} -- Tensorflow parameters. - layer {torch.nn.Module} -- nn.BatchNorm. - """ - layer.bias.data = torch.tensor(weights[0]).view(layer.bias.data.shape) - layer.weight.data = torch.ones_like(layer.weight.data) - layer.running_mean = torch.tensor(weights[1]).view(layer.running_mean.shape) - layer.running_var = torch.tensor(weights[2]).view(layer.running_var.shape) - - -def load_tf_conv2d(weights, layer, transpose=False): - """Load tensorflow weights into nn.Conv2d object. - - Arguments: - weights {list} -- Tensorflow parameters. - layer {torch.nn.Module} -- nn.Conv2d. - """ - if isinstance(weights, list): - if len(weights) == 2: - layer.bias.data = ( - torch.tensor(weights[1]) - .view(layer.bias.data.shape) - ) - weights = weights[0] - - if transpose: - dim_order = (3, 2, 1, 0) - else: - dim_order = (3, 2, 0, 1) - - layer.weight.data = ( - torch.tensor(weights) - .permute(dim_order) - .view(layer.weight.data.shape) - ) - - -def load_tf_conv2d_trans(weights, layer): - return load_tf_conv2d(weights, layer, transpose=True) - - -def load_tf_basicConv2d(weights, layer): - """Load tensorflow weights into grouped Conv2d+BatchNorm object. - - Arguments: - weights {list} -- Tensorflow parameters. - layer {torch.nn.Module} -- Object containing Conv2d+BatchNorm. - """ - load_tf_conv2d(weights[0], layer.conv) - load_tf_batchNorm(weights[1:], layer.bn) - - -def load_tf_linear(weights, layer): - """Load tensorflow weights into nn.Linear object. - - Arguments: - weights {list} -- Tensorflow parameters. - layer {torch.nn.Module} -- nn.Linear. 
- """ - if isinstance(weights, list): - if len(weights) == 2: - layer.bias.data = ( - torch.tensor(weights[1]) - .view(layer.bias.data.shape) - ) - weights = weights[0] - layer.weight.data = ( - torch.tensor(weights) - .transpose(-1, 0) - .view(layer.weight.data.shape) - ) - - -# High-level parameter-loading functions: - -def load_tf_block35(weights, layer): - load_tf_basicConv2d(weights[:4], layer.branch0) - load_tf_basicConv2d(weights[4:8], layer.branch1[0]) - load_tf_basicConv2d(weights[8:12], layer.branch1[1]) - load_tf_basicConv2d(weights[12:16], layer.branch2[0]) - load_tf_basicConv2d(weights[16:20], layer.branch2[1]) - load_tf_basicConv2d(weights[20:24], layer.branch2[2]) - load_tf_conv2d(weights[24:26], layer.conv2d) - - -def load_tf_block17_8(weights, layer): - load_tf_basicConv2d(weights[:4], layer.branch0) - load_tf_basicConv2d(weights[4:8], layer.branch1[0]) - load_tf_basicConv2d(weights[8:12], layer.branch1[1]) - load_tf_basicConv2d(weights[12:16], layer.branch1[2]) - load_tf_conv2d(weights[16:18], layer.conv2d) - - -def load_tf_mixed6a(weights, layer): - if len(weights) != 16: - raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 16') - load_tf_basicConv2d(weights[:4], layer.branch0) - load_tf_basicConv2d(weights[4:8], layer.branch1[0]) - load_tf_basicConv2d(weights[8:12], layer.branch1[1]) - load_tf_basicConv2d(weights[12:16], layer.branch1[2]) - - -def load_tf_mixed7a(weights, layer): - if len(weights) != 28: - raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 28') - load_tf_basicConv2d(weights[:4], layer.branch0[0]) - load_tf_basicConv2d(weights[4:8], layer.branch0[1]) - load_tf_basicConv2d(weights[8:12], layer.branch1[0]) - load_tf_basicConv2d(weights[12:16], layer.branch1[1]) - load_tf_basicConv2d(weights[16:20], layer.branch2[0]) - load_tf_basicConv2d(weights[20:24], layer.branch2[1]) - load_tf_basicConv2d(weights[24:28], layer.branch2[2]) - - -def load_tf_repeats(weights, layer, rptlen, subfun): - if len(weights) % rptlen != 0: - raise ValueError(f'Number of weight arrays ({len(weights)}) not divisible by {rptlen}') - weights_split = [weights[i:i+rptlen] for i in range(0, len(weights), rptlen)] - for i, w in enumerate(weights_split): - subfun(w, getattr(layer, str(i))) - - -def load_tf_repeat_1(weights, layer): - load_tf_repeats(weights, layer, 26, load_tf_block35) - - -def load_tf_repeat_2(weights, layer): - load_tf_repeats(weights, layer, 18, load_tf_block17_8) - - -def load_tf_repeat_3(weights, layer): - load_tf_repeats(weights, layer, 18, load_tf_block17_8) - - -def test_loaded_params(mdl, tf_params, tf_layers): - """Check each parameter in a pytorch model for an equivalent parameter - in a list of tensorflow variables. - - Arguments: - mdl {torch.nn.Module} -- Pytorch model. - tf_params {list} -- List of ndarrays representing tensorflow variables. - tf_layers {list} -- Corresponding list of tensorflow variable names. - """ - tf_means = torch.stack([torch.tensor(p).mean() for p in tf_params]) - for name, param in mdl.named_parameters(): - pt_mean = param.data.mean() - matching_inds = ((tf_means - pt_mean).abs() < 1e-8).nonzero() - print(f'{name} equivalent to {[tf_layers[i] for i in matching_inds]}') - - -def compare_model_outputs(pt_mdl, sess, test_data): - """Given some testing data, compare the output of pytorch and tensorflow models. - - Arguments: - pt_mdl {torch.nn.Module} -- Pytorch model. - sess {tensorflow.Session} -- Tensorflow session object. - test_data {torch.Tensor} -- Pytorch tensor. 
- """ - print('\nPassing test data through TF model\n') - if isinstance(sess, tf.Session): - images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0") - phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0") - embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0") - feed_dict = {images_placeholder: test_data.numpy(), phase_train_placeholder: False} - tf_output = torch.tensor(sess.run(embeddings, feed_dict=feed_dict)) - else: - tf_output = sess(test_data) - - print(tf_output.shape, tf_output) - - print('\nPassing test data through PT model\n') - pt_output = pt_mdl(test_data.permute(0, 3, 1, 2)) - print(pt_output.shape, pt_output) - - distance = (tf_output - pt_output).norm() - print(f'\nDistance {distance}\n') - - -def compare_mtcnn(pt_mdl, tf_fun, sess, ind, test_data): - tf_mdls = tf_fun(sess) - tf_mdl = tf_mdls[ind] - - print('\nPassing test data through TF model\n') - tf_output = tf_mdl(test_data.numpy()) - tf_output = [torch.tensor(out) for out in tf_output] - print('\n'.join([str(o.view(-1)[:10]) for o in tf_output])) - - print('\nPassing test data through PT model\n') - with torch.no_grad(): - pt_output = pt_mdl(test_data.permute(0, 3, 2, 1)) - pt_output = [torch.tensor(out) for out in pt_output] - for i in range(len(pt_output)): - if len(pt_output[i].shape) == 4: - pt_output[i] = pt_output[i].permute(0, 3, 2, 1).contiguous() - print('\n'.join([str(o.view(-1)[:10]) for o in pt_output])) - - distance = [(tf_o - pt_o).norm() for tf_o, pt_o in zip(tf_output, pt_output)] - print(f'\nDistance {distance}\n') - - -def load_tf_model_weights(mdl, layer_lookup, tf_mdl_dir, is_resnet=True, arg_num=None): - """Load tensorflow parameters into a pytorch model. - - Arguments: - mdl {torch.nn.Module} -- Pytorch model. - layer_lookup {[type]} -- Dictionary mapping pytorch attribute names to (partial) - tensorflow variable names, and a function suitable for loading weights. - Expects dict of the form {'attr': ['tf_name', function]}. - tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files. 
- """ - tf.reset_default_graph() - with tf.Session() as sess: - tf_layers, tf_params, tf_shapes = import_tf_params(tf_mdl_dir, sess) - layer_info = get_layer_indices(layer_lookup, tf_layers) - - for layer_name, info in layer_info.items(): - print(f'Loading {info[0]}/* into {layer_name}') - weights = [tf_params[i] for i in info[2]] - layer = getattr(mdl, layer_name) - info[1](weights, layer) - - test_loaded_params(mdl, tf_params, tf_layers) - - if is_resnet: - compare_model_outputs(mdl, sess, torch.randn(5, 160, 160, 3).detach()) - - -def tensorflow2pytorch(args): - lookup_inception_resnet_v1 = { - 'conv2d_1a': ['InceptionResnetV1/Conv2d_1a_3x3', load_tf_basicConv2d], - 'conv2d_2a': ['InceptionResnetV1/Conv2d_2a_3x3', load_tf_basicConv2d], - 'conv2d_2b': ['InceptionResnetV1/Conv2d_2b_3x3', load_tf_basicConv2d], - 'conv2d_3b': ['InceptionResnetV1/Conv2d_3b_1x1', load_tf_basicConv2d], - 'conv2d_4a': ['InceptionResnetV1/Conv2d_4a_3x3', load_tf_basicConv2d], - 'conv2d_4b': ['InceptionResnetV1/Conv2d_4b_3x3', load_tf_basicConv2d], - 'repeat_1': ['InceptionResnetV1/Repeat/block35', load_tf_repeat_1], - 'mixed_6a': ['InceptionResnetV1/Mixed_6a', load_tf_mixed6a], - 'repeat_2': ['InceptionResnetV1/Repeat_1/block17', load_tf_repeat_2], - 'mixed_7a': ['InceptionResnetV1/Mixed_7a', load_tf_mixed7a], - 'repeat_3': ['InceptionResnetV1/Repeat_2/block8', load_tf_repeat_3], - 'block8': ['InceptionResnetV1/Block8', load_tf_block17_8], - 'last_linear': ['InceptionResnetV1/Bottleneck/weights', load_tf_linear], - 'last_bn': ['InceptionResnetV1/Bottleneck/BatchNorm', load_tf_batchNorm], - # 'logits': ['Logits', load_tf_linear], - } - - print('\nLoad CASIA-Webface-trained weights and save\n') - mdl = InceptionResnetV1(num_classes=10575).eval() - tf_mdl_dir = args.facenet_pb_path - - load_tf_model_weights(mdl, lookup_inception_resnet_v1, tf_mdl_dir) - # print(f'????????') - # data_name = 'casia-webfacexxxxxxx' - # state_dict = mdl.state_dict() - # torch.save(state_dict, f'{tf_mdl_dir}-{data_name}.pt') - - x = torch.rand(64, 3, 160, 160)#.cuda() - # y = resnet(x) - # print(y.shape) - - - f = f"{args.facenet_weights_path}/{args.onnx_save_name}" - torch.onnx.export(mdl, x, f, verbose=False, opset_version=11, - input_names=['input'], output_names=['output'], dynamic_axes=None) - - - -import argparse -def parse_args(): - parser = argparse.ArgumentParser("deploy facenet") - parser.add_argument("--facenet_weights_path", default="", help="onnx model path") - parser.add_argument("--facenet_pb_path", default="", help="") - parser.add_argument("--onnx_save_name", default="", help="") - - return parser.parse_args() -args = parse_args() - -tensorflow2pytorch(args) - - -# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') -# print('Running on device: {}'.format(device)) - -# # Load pretrained resnet model -# resnet = InceptionResnetV1( -# classify=False, -# pretrained='casia-webface' -# )#.to(device) - -# x = torch.rand(64, 3, 160, 160)#.cuda() -# y = resnet(x) -# print(y.shape) - - -# f = f"{args.facenet_weights_path}/{args.onnx_save_name}" -# torch.onnx.export(resnet, x, f, verbose=False, opset_version=11, input_names=['input'], output_names=['output'], dynamic_axes=None) diff --git a/models/cv/face_recognition/facenet/ixrt/utils.py b/models/cv/face_recognition/facenet/ixrt/utils.py index ab8f213bf6bf629ad073140f4ab886760c707759..f908e887663a97766b5006c4472f373ba71dfa60 100644 --- a/models/cv/face_recognition/facenet/ixrt/utils.py +++ b/models/cv/face_recognition/facenet/ixrt/utils.py @@ -1,18 +1,3 @@ -# 
Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import os
 import math
diff --git a/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh b/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
index 66c8f9d0525bc855866325817dd7ee87aad8989f..107ffda40c31521179d432506d46984ab75805fc 100644
--- a/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
+++ b/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
@@ -27,20 +27,7 @@ fi

 pip install -r requirements.txt

-cp -r /root/data/3rd_party/mmcv-v1.7.1 ./mmcv
-cp -r -T /root/data/repos/deepsparkhub/toolbox/MMDetection/patch/mmcv/v1.7.1 ./mmcv
-cd mmcv
-rm -rf mmcv/ops/csrc/common/cuda/spconv/ mmcv/ops/csrc/common/utils/spconv/
-rm -f mmcv/ops/csrc/pytorch/cpu/sparse_*
-rm -f mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/sparse_*
-rm -f mmcv/ops/csrc/pytorch/sp*
-
-bash clean_mmcv.sh
-bash build_mmcv.sh
-bash install_mmcv.sh
-cd ..
+pip install /root/data/install/mmcv_full-1.7.0+corex.20250108131027-cp310-cp310-linux_x86_64.whl

 mkdir -p checkpoints
 ln -s /root/data/checkpoints/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth ./
diff --git a/models/cv/object_detection/fcos/ixrt/README.md b/models/cv/object_detection/fcos/ixrt/README.md
index b11d07f0cf0a97f3a5bd99ccad9d562e56634160..721fed1552fa6acee11ec5cc089a6aefbb7614cd 100755
--- a/models/cv/object_detection/fcos/ixrt/README.md
+++ b/models/cv/object_detection/fcos/ixrt/README.md
@@ -34,48 +34,23 @@ apt install -y libgl1-mesa-glx
 pip3 install -r requirements.txt
 ```

-The inference of the FCOS model requires a dependency on a well-adapted mmcv-v1.7.0 library. Please inquire with the staff to obtain the relevant libraries.
-
-You can follow the script [prepare_mmcv.sh](https://gitee.com/deep-spark/deepsparkhub/blob/master/toolbox/MMDetection/prepare_mmcv.sh) to build:
-
-```bash
-cd mmcv
-sh build_mmcv.sh
-sh install_mmcv.sh
-```
-
 ### Model Conversion

 MMDetection is an open source object detection toolbox based on PyTorch. It is a part of the OpenMMLab project. It is utilized for model conversion.
 In MMDetection, execute the model conversion command; the checkpoints folder needs to be created in the project first (mkdir checkpoints).

 ```bash
 mkdir -p checkpoints
-git clone -b v2.25.0 https://github.com/open-mmlab/mmdetection.git
-cd mmdetection
-python3 tools/deployment/pytorch2onnx.py \
-    /Path/to/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py \
-    checkpoints/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth \
-    --output-file /Path/To/ixrt/data/checkpoints/r50_fcos.onnx \
-    --input-img demo/demo.jpg \
-    --test-img tests/data/color.jpg \
-    --shape 800 800 \
-    --show \
-    --verify \
-    --skip-postprocess \
-    --dynamic-export \
-    --cfg-options \
-    model.test_cfg.deploy_nms_pre=-1
+cd checkpoints
+wget http://files.deepspark.org.cn:880/deepspark/fcos_opt.onnx
 ```

-If there are issues such as input parameter mismatch during model export, it may be due to ONNX version. To resolve this, please delete the last parameter (dynamic_slice) from the return value of the_slice_helper function in the /usr/local/lib/python3.10/site-packages/mmcv/onnx/onnx_utils/symbolic_helper.py file.
-
 ## Model Inference

 ```bash
 export PROJ_DIR=./
-export DATASETS_DIR=/Path/to/coco/
-export CHECKPOINTS_DIR=/Path/to/checkpoints
-export RUN_DIR=./
+export DATASETS_DIR=./coco/
+export CHECKPOINTS_DIR=./checkpoints
+export RUN_DIR=../../ixrt_common
 ```

 ### FP16
diff --git a/models/cv/object_detection/fcos/ixrt/build_engine.py b/models/cv/object_detection/fcos/ixrt/build_engine.py
deleted file mode 100755
index af649916756a27bde0aea18b9f3572a430a424d9..0000000000000000000000000000000000000000
--- a/models/cv/object_detection/fcos/ixrt/build_engine.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
- -import os -import cv2 -import argparse -import numpy as np - -import torch -import tensorrt -from calibration_dataset import getdataloader -import cuda.cudart as cudart - -def assertSuccess(err): - assert(err == cudart.cudaError_t.cudaSuccess) - -class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2): - - def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=800): - super().__init__() - self.cache_file = cache_file - self.image_batcher = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz) - self.batch_generator = iter(self.image_batcher) - size = img_sz*img_sz*3*bsz - __import__('pdb').set_trace() - err, self.batch_allocation = cudart.cudaMalloc(size) - assertSuccess(err) - - def __del__(self): - err,= cudart.cudaFree(self.batch_allocation) - assertSuccess(err) - - def get_batch_size(self): - return self.image_batcher.batch_size - - def get_batch(self, names): - try: - batch, _ = next(self.batch_generator) - batch = batch.numpy() - __import__('pdb').set_trace() - cudart.cudaMemcpy(self.batch_allocation, - np.ascontiguousarray(batch), - batch.nbytes, - cudart.cudaMemcpyKind.cudaMemcpyHostToDevice) - return [int(self.batch_allocation)] - except StopIteration: - return None - - def read_calibration_cache(self): - if os.path.exists(self.cache_file): - with open(self.cache_file, "rb") as f: - return f.read() - - def write_calibration_cache(self, cache): - with open(self.cache_file, "wb") as f: - f.write(cache) - -def main(config): - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(config.model) - - precision = tensorrt.BuilderFlag.FP16 - print("precision : ", precision) - build_config.set_flag(precision) - - plan = builder.build_serialized_network(network, build_config) - engine_file_path = config.engine - with open(engine_file_path, "wb") as f: - f.write(plan) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--engine", type=str, default=None) - parser.add_argument( - "--datasets_dir", - type=str, - default="", - help="ImageNet dir", - ) - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py b/models/cv/object_detection/fcos/ixrt/calibration_dataset.py deleted file mode 100644 index d7525d5136168cc8fb1d24a28f1b71b85ce4cc92..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os - -import torch -import torchvision.datasets -from torch.utils.data import DataLoader -from torchvision import models -from torchvision import transforms as T - - -class CalibrationImageNet(torchvision.datasets.ImageFolder): - def __init__(self, *args, **kwargs): - super(CalibrationImageNet, self).__init__(*args, **kwargs) - img2label_path = os.path.join(self.root, "val_map.txt") - if not os.path.exists(img2label_path): - raise FileNotFoundError(f"Not found label file `{img2label_path}`.") - - self.img2label_map = self.make_img2label_map(img2label_path) - - def make_img2label_map(self, path): - with open(path) as f: - lines = f.readlines() - - img2lable_map = dict() - for line in lines: - line = line.lstrip().rstrip().split("\t") - if len(line) != 2: - continue - img_name, label = line - img_name = img_name.strip() - if img_name in [None, ""]: - continue - label = int(label.strip()) - img2lable_map[img_name] = label - return img2lable_map - - def __getitem__(self, index): - path, target = self.samples[index] - sample = self.loader(path) - if self.transform is not None: - sample = self.transform(sample) - # if self.target_transform is not None: - # target = self.target_transform(target) - img_name = os.path.basename(path) - target = self.img2label_map[img_name] - - return sample, target - - -def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0): - dataset = CalibrationImageNet( - data_path, - transform=T.Compose( - [ - T.Resize(256), - T.CenterCrop(img_sz), - T.ToTensor(), - T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ), - ) - - calibration_dataset = dataset - if num_samples is not None: - calibration_dataset = torch.utils.data.Subset( - dataset, indices=range(num_samples) - ) - - calibration_dataloader = DataLoader( - calibration_dataset, - shuffle=False, - batch_size=batch_size, - drop_last=False, - num_workers=workers, - ) - - verify_dataloader = DataLoader( - dataset, - shuffle=False, - batch_size=batch_size, - drop_last=False, - num_workers=workers, - ) - - return calibration_dataloader, verify_dataloader - - -def getdataloader(dataset_dir, step=20, batch_size=32, workers=2, img_sz=224, total_sample=50000): - num_samples = min(total_sample, step * batch_size) - if step < 0: - num_samples = None - calibration_dataloader, _ = create_dataloaders( - dataset_dir, - img_sz=img_sz, - batch_size=batch_size, - workers=workers, - num_samples=num_samples, - ) - return calibration_dataloader \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/ci/prepare.sh b/models/cv/object_detection/fcos/ixrt/ci/prepare.sh index a19f48112c37f0e4219772b902d69eed56477392..a04697754cb74c96dbec50e56bcb3a207fcb3b58 100644 --- a/models/cv/object_detection/fcos/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/fcos/ixrt/ci/prepare.sh @@ -25,36 +25,6 @@ else echo "Not Support Os" fi pip3 install -r requirements.txt -cp -r /root/data/3rd_party/mmcv-v1.7.1 ./mmcv -cp -r -T /root/data/repos/deepsparkhub/toolbox/MMDetection/patch/mmcv/v1.7.1 ./mmcv -cd mmcv -rm -rf mmcv/ops/csrc/common/cuda/spconv/ mmcv/ops/csrc/common/utils/spconv/ -rm -f mmcv/ops/csrc/pytorch/cpu/sparse_* -rm -f mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu -rm -f mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu -rm -f mmcv/ops/csrc/pytorch/cuda/sparse_* -rm -f mmcv/ops/csrc/pytorch/sp* - -sed -i 's/return _slice(g, input, axes, starts, ends, steps, dynamic_slice)/return _slice(g, input, axes, starts, ends, steps)/' 
mmcv/onnx/onnx_utils/symbolic_helper.py - -bash clean_mmcv.sh -bash build_mmcv.sh -bash install_mmcv.sh -cd .. - +pip install /root/data/install/mmcv_full-1.7.0+corex.20250108131027-cp310-cp310-linux_x86_64.whl mkdir -p checkpoints -cp -r /root/data/3rd_party/mmdetection-v2.25.0 ./mmdetection -cd mmdetection -python3 tools/deployment/pytorch2onnx.py \ - ../fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py \ - /root/data/checkpoints/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth \ - --output-file ../checkpoints/r50_fcos.onnx \ - --input-img demo/demo.jpg \ - --test-img tests/data/color.jpg \ - --shape 800 800 \ - --show \ - --verify \ - --skip-postprocess \ - --dynamic-export \ - --cfg-options \ - model.test_cfg.deploy_nms_pre=-1 \ No newline at end of file +cp /root/data/checkpoints/fcos_opt.onnx checkpoints/ diff --git a/models/cv/object_detection/fcos/ixrt/common.py b/models/cv/object_detection/fcos/ixrt/common.py deleted file mode 100644 index b18a24394c934c40f1f1ab761ff946edbf69f53a..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/fcos/ixrt/common.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import numpy as np -from tqdm import tqdm - -import tensorrt -import pycuda.driver as cuda - -# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] -# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] -def box_class85to6(input): - center_x_y = input[:, :2] - side = input[:, 2:4] - conf = input[:, 4:5] - class_id = np.argmax(input[:, 5:], axis = -1) - class_id = class_id.astype(np.float32).reshape(-1, 1) + 1 - max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1) - x1_y1 = center_x_y - 0.5 * side - x2_y2 = center_x_y + 0.5 * side - nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1) - return nms_input - -def save2json(batch_img_id, pred_boxes, json_result): - for i, boxes in enumerate(pred_boxes): - image_id = int(batch_img_id) - if boxes is not None: - x, y, w, h, c, p = boxes - if image_id!=-1: - - x, y, w, h, p = float(x), float(y), float(w), float(h), float(p) - c = int(c) - json_result.append( - { - "image_id": image_id, - "category_id": c, - "bbox": [x, y, w, h], - "score": p, - } - ) - -def create_engine_context(engine_path, logger): - with open(engine_path, "rb") as f: - runtime = tensorrt.Runtime(logger) - assert runtime - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine - context = engine.create_execution_context() - assert context - - return engine, context - -def get_io_bindings(engine): - # Setup I/O bindings - inputs = [] - outputs = [] - allocations = [] - - for i in range(engine.num_bindings): - is_input = False - if engine.binding_is_input(i): - is_input = True - name = engine.get_binding_name(i) - dtype = engine.get_binding_dtype(i) - shape = engine.get_binding_shape(i) - if is_input: - batch_size = shape[0] - size = np.dtype(tensorrt.nptype(dtype)).itemsize - for s in shape: - size *= s - allocation = cuda.mem_alloc(size) - binding = { - "index": i, - "name": name, - "dtype": np.dtype(tensorrt.nptype(dtype)), - "shape": list(shape), - "allocation": allocation, - } - # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") - allocations.append(allocation) - if engine.binding_is_input(i): - inputs.append(binding) - else: - outputs.append(binding) - return inputs, outputs, allocations \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py deleted file mode 100644 index 72d17de86f01b3f4b1b39e8ea6fb0dfa32abfe0a..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py' - -model = dict( - backbone=dict( - init_cfg=dict( - type='Pretrained', - checkpoint='open-mmlab://detectron2/resnet50_caffe')), - bbox_head=dict( - norm_on_bbox=True, - centerness_on_reg=True, - dcn_on_last_conv=False, - center_sampling=True, - conv_bias=True, - loss_bbox=dict(type='GIoULoss', loss_weight=1.0)), - # training and testing settings - test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6))) - -# dataset settings -img_norm_cfg = dict( - mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', with_bbox=True), - dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(1333, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=2, - workers_per_gpu=2, - train=dict(pipeline=train_pipeline), - val=dict(pipeline=test_pipeline), - test=dict(pipeline=test_pipeline)) -optimizer_config = dict(_delete_=True, grad_clip=None) - -lr_config = dict(warmup='linear') \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py b/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py deleted file mode 100644 index 9218ea3a2d5ac16be9e1bc11ba7ee4e4e7c0c3f2..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import sys -from tqdm import tqdm -import numpy as np - -import argparse - -import torch -import mmcv -from mmdet.datasets import build_dataloader, build_dataset -from mmdet.models import build_detector -from mmdet.core import bbox2result -import cv2 -import numpy as np -import onnxruntime as rt - -import time - -import os -import copy -from common import create_engine_context, get_io_bindings -import pycuda.autoinit -import pycuda.driver as cuda -import tensorrt -from tensorrt import Dims - -def check_target(inference, target): - satisfied = False - if inference > target: - satisfied = True - return satisfied - -def get_dataloder(args): - cfg_path = args.cfg_file - cfg = mmcv.Config.fromfile(cfg_path) - datasets_path = args.data_path - cfg['data']['val']['img_prefix'] = os.path.join(datasets_path, 'val2017') - cfg['data']['val']['ann_file'] = os.path.join(datasets_path, 'annotations/instances_val2017.json') - dataset = build_dataset(cfg.data.val) - data_loader = build_dataloader(dataset, samples_per_gpu=args.batch_size, workers_per_gpu=args.num_workers, shuffle=False) - model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) - return dataset, data_loader, model - -def eval_coco(args, inputs, outputs, allocations, context): - dataset, dataloader, model = get_dataloder(args) - - # Prepare the output data - outputs_651 = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - outputs_766 = np.zeros(outputs[1]["shape"], outputs[1]["dtype"]) - outputs_881 = np.zeros(outputs[2]["shape"], outputs[2]["dtype"]) - outputs_996 = np.zeros(outputs[3]["shape"], outputs[3]["dtype"]) - outputs_1111 = np.zeros(outputs[4]["shape"], outputs[4]["dtype"]) - outputs_713 = np.zeros(outputs[5]["shape"], outputs[5]["dtype"]) - outputs_828 = np.zeros(outputs[6]["shape"], outputs[6]["dtype"]) - outputs_943 = np.zeros(outputs[7]["shape"], outputs[7]["dtype"]) - outputs_1058 = np.zeros(outputs[8]["shape"], outputs[8]["dtype"]) - outputs_1173 = np.zeros(outputs[9]["shape"], outputs[9]["dtype"]) - outputs_705 = np.zeros(outputs[10]["shape"], outputs[10]["dtype"]) - outputs_820 = np.zeros(outputs[11]["shape"], outputs[11]["dtype"]) - outputs_935 = np.zeros(outputs[12]["shape"], outputs[12]["dtype"]) - outputs_1050 = np.zeros(outputs[13]["shape"], outputs[13]["dtype"]) - outputs_1165 = np.zeros(outputs[14]["shape"], outputs[14]["dtype"]) - - preds = [] - for batch in tqdm(dataloader): - image = batch['img'][0].data.numpy() - image = image.astype(inputs[0]["dtype"]) - # Set input - image = np.ascontiguousarray(image) - cuda.memcpy_htod(inputs[0]["allocation"], image) - context.execute_v2(allocations) - # # Fetch output - cuda.memcpy_dtoh(outputs_651, outputs[0]["allocation"]) - cuda.memcpy_dtoh(outputs_766, outputs[1]["allocation"]) - cuda.memcpy_dtoh(outputs_881, outputs[2]["allocation"]) - cuda.memcpy_dtoh(outputs_996, outputs[3]["allocation"]) - cuda.memcpy_dtoh(outputs_1111, outputs[4]["allocation"]) - cuda.memcpy_dtoh(outputs_713, outputs[5]["allocation"]) - cuda.memcpy_dtoh(outputs_828, outputs[6]["allocation"]) - cuda.memcpy_dtoh(outputs_943, outputs[7]["allocation"]) - cuda.memcpy_dtoh(outputs_1058, outputs[8]["allocation"]) - cuda.memcpy_dtoh(outputs_1173, outputs[9]["allocation"]) - cuda.memcpy_dtoh(outputs_705, outputs[10]["allocation"]) - cuda.memcpy_dtoh(outputs_820, outputs[11]["allocation"]) - cuda.memcpy_dtoh(outputs_935, outputs[12]["allocation"]) - cuda.memcpy_dtoh(outputs_1050, outputs[13]["allocation"]) - cuda.memcpy_dtoh(outputs_1165, outputs[14]["allocation"]) - - cls_score = [] - 
box_reg = [] - score_factors = [] - cls_score.append(torch.from_numpy(outputs_651)) - cls_score.append(torch.from_numpy(outputs_766)) - cls_score.append(torch.from_numpy(outputs_881)) - cls_score.append(torch.from_numpy(outputs_996)) - cls_score.append(torch.from_numpy(outputs_1111)) - - box_reg.append(torch.from_numpy(outputs_713)) - box_reg.append(torch.from_numpy(outputs_828)) - box_reg.append(torch.from_numpy(outputs_943)) - box_reg.append(torch.from_numpy(outputs_1058)) - box_reg.append(torch.from_numpy(outputs_1173)) - - score_factors.append(torch.from_numpy(outputs_705)) - score_factors.append(torch.from_numpy(outputs_820)) - score_factors.append(torch.from_numpy(outputs_935)) - score_factors.append(torch.from_numpy(outputs_1050)) - score_factors.append(torch.from_numpy(outputs_1165)) - - cls_score.sort(key=lambda x: x.shape[3],reverse=True) - box_reg.sort(key=lambda x: x.shape[3],reverse=True) - score_factors.sort(key=lambda x: x.shape[3],reverse=True) - - pred = model.bbox_head.get_bboxes(cls_score, box_reg, score_factors=score_factors, img_metas=batch['img_metas'][0].data[0], rescale=True) - bbox_results = [ - bbox2result(det_bboxes, det_labels, model.bbox_head.num_classes) - for det_bboxes, det_labels in pred - ] - preds.extend(bbox_results) - eval_results = dataset.evaluate(preds, metric=['bbox']) - print(eval_results) - - map50 = eval_results['bbox_mAP_50'] - return map50 - -def parse_args(): - parser = argparse.ArgumentParser() - # engine args - parser.add_argument("--engine", type=str, default="./r50_fcos.engine") - parser.add_argument("--cfg_file", type=str, default="fcos_r50_caffe_fpn_gn-head_1x_coco.py") - parser.add_argument("--data_path", type=str, default="/home/datasets/cv/coco") - parser.add_argument("--batch_size", type=int, default=16) - parser.add_argument("--num_workers", type=int, default=4) - parser.add_argument("--image_file", type=str, default="/home/fangjian.hu/workspace/ixrt/data/fcos_test/test_800.jpg") - parser.add_argument("--warp_up", type=int, default=40) - parser.add_argument("--loop_count", type=int, default=50) - - parser.add_argument("--target_map", default=0.56, type=float, help="target map0.5") - parser.add_argument("--target_fps", default=50, type=float, help="target fps") - parser.add_argument("--task", default="precision", type=str, help="precision or pref") - - - args = parser.parse_args() - return args - -def main(): - args= parse_args() - host_mem = tensorrt.IHostMemory - logger = tensorrt.Logger(tensorrt.Logger.ERROR) - - # Load Engine - engine, context = create_engine_context(args.engine, logger) - inputs, outputs, allocations = get_io_bindings(engine) - - if args.task=="precision": - start_time = time.time() - map50= eval_coco(args,inputs, outputs, allocations, context) - end_time = time.time() - e2e_time = end_time - start_time - print(F"E2E time : {e2e_time:.3f} seconds") - - print("="*40) - print("MAP50:{0}".format(round(map50,3))) - print("="*40) - print(f"Check MAP50 Test : {round(map50,3)} Target:{args.target_map} State : {'Pass' if round(map50,3) >= args.target_map else 'Fail'}") - status_map = check_target(map50, args.target_map) - sys.exit(int(not (status_map))) - - else: - torch.cuda.synchronize() - start_time = time.time() - for i in range(args.loop_count): - context.execute_v2(allocations) - torch.cuda.synchronize() - end_time = time.time() - forward_time = end_time - start_time - fps = args.loop_count * args.batch_size / forward_time - print("="*40) - print("fps:{0}".format(round(fps,2))) - print("="*40) - print(f"Check fps 
Test : {round(fps,3)} Target:{args.target_fps} State : {'Pass' if fps >= args.target_fps else 'Fail'}") - status_fps = check_target(fps, args.target_fps) - sys.exit(int(not (status_fps))) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py deleted file mode 100644 index 758d1d88571ac49ae26bc124f0052716bbf761d4..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -# model settings -model = dict( - type='FCOS', - backbone=dict( - type='ResNet', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - frozen_stages=1, - norm_cfg=dict(type='BN', requires_grad=False), - norm_eval=True, - style='caffe', - init_cfg=dict( - type='Pretrained', - checkpoint='open-mmlab://detectron/resnet50_caffe')), - neck=dict( - type='FPN', - in_channels=[256, 512, 1024, 2048], - out_channels=256, - start_level=1, - add_extra_convs='on_output', # use P5 - num_outs=5, - relu_before_extra_convs=True), - bbox_head=dict( - type='FCOSHead', - num_classes=80, - in_channels=256, - stacked_convs=4, - feat_channels=256, - strides=[8, 16, 32, 64, 128], - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict(type='IoULoss', loss_weight=1.0), - loss_centerness=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), - # training and testing settings - train_cfg=dict( - assigner=dict( - type='MaxIoUAssigner', - pos_iou_thr=0.5, - neg_iou_thr=0.4, - min_pos_iou=0, - ignore_iof_thr=-1), - allowed_border=-1, - pos_weight=-1, - debug=False), - test_cfg=dict( - nms_pre=1000, - min_bbox_size=0, - score_thr=0.05, - nms=dict(type='nms', iou_threshold=0.5), - max_per_img=100)) - -# dataset settings -dataset_type = 'CocoDataset' -data_root = 'data/coco/' -img_norm_cfg = dict( - mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False) - -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(800, 800), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=(800, 800)), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - samples_per_gpu=32, - workers_per_gpu=1, - val=dict( - type=dataset_type, - ann_file=data_root + 'annotations/instances_val2017.json', - img_prefix=data_root + 'images/val2017/', - pipeline=test_pipeline) - ) -evaluation = dict(interval=1, metric='bbox') \ No newline at end of file diff --git a/models/cv/object_detection/fcos/ixrt/requirements.txt b/models/cv/object_detection/fcos/ixrt/requirements.txt index 
a0763974b54feecde9c5a7654327122855e85eed..c6f9129a7799504d38ae80fd0468dc8ac3ba50a8 100644
--- a/models/cv/object_detection/fcos/ixrt/requirements.txt
+++ b/models/cv/object_detection/fcos/ixrt/requirements.txt
@@ -1,10 +1,9 @@
+yapf==0.40.2
+addict==2.4.0
+mmdet==3.3.0
 tqdm
 onnx
 onnxsim
-ultralytics
 pycocotools
-addict
-yapf
-pycuda
-mmdet==2.28.2
-opencv-python==4.6.0.66
\ No newline at end of file
+opencv-python==4.6.0.66
+mmengine
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
index b6ccfe626f4a0f1ab6247aac4c4dc14f1998d3cb..aa081403bd07c008644e2db01e03332945fa4155 100755
--- a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
+++ b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
@@ -24,7 +9,7 @@ check_status()
 }
 
 # Run parameters
-BSZ=1
+BSZ=32
 WARM_UP=-1
 TGT=-1
 LOOP_COUNT=-1
@@ -44,15 +29,15 @@ do
 esac
 done
 
-MODEL_NAME="r50_fcos"
+MODEL_NAME="fcos_opt"
+ORIGINE_MODEL="${CHECKPOINTS_DIR}/fcos_opt.onnx"
 
-echo PROJ_DIR ${PROJ_DIR}
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
 
 step=0
-
+CURRENT_MODEL=${ORIGINE_MODEL}
 # Simplify Model
 let step++
 echo;
@@ -62,22 +47,40 @@
 if [ -f ${SIM_MODEL} ];then
     echo " "Simplify Model Skipped, ${SIM_MODEL} already exists
 else
     python3 ${RUN_DIR}/simplify_model.py \
-    --origin_model ${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx \
+    --origin_model ${CURRENT_MODEL} \
     --output_model ${SIM_MODEL}
     echo " "Generate ${SIM_MODEL}
 fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Change Batchsize Skipped, $FINAL_MODEL already exists
+else
+    python3 ${RUN_DIR}/modify_batchsize.py \
+    --batch_size ${BSZ} \
+    --origin_model ${CURRENT_MODEL} \
+    --output_model ${FINAL_MODEL}
+    echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo " "Build Engine Skipped, $ENGINE_FILE already exists
 else
     python3 ${RUN_DIR}/build_engine.py \
-    --model ${SIM_MODEL} \
+    --model ${CURRENT_MODEL} \
     --engine ${ENGINE_FILE}
     echo " "Generate Engine ${ENGINE_FILE}
 fi
@@ -86,11 +89,10 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Inference
-python3 ${RUN_DIR}/fcos_ixrt_inference.py \
+python3 ${RUN_DIR}/inference_mmdet.py \
     --engine ${ENGINE_FILE} \
     --cfg_file ${RUN_DIR}/fcos_r50_caffe_fpn_gn-head_1x_coco.py \
-    --task "precision" \
-    --data_path ${DATASETS_DIR} \
-    --batch_size 1 \
-    --target_map 0.54; check_status
+    --datasets ${DATASETS_DIR} \
+    --batchsize ${BSZ} \
+    --acc_target ${TGT}; check_status
 exit ${EXIT_STATUS}
\ No newline at end of file
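
The `modify_batchsize.py` helper invoked by the new Change Batchsize step is not included in this patch. A minimal sketch of what such a helper typically does, assuming the simplified ONNX graph exposes a static leading batch dimension (the flag names mirror the invocation above; the body itself is an assumption, not the shipped script):

```python
# Hypothetical sketch only: rewrite the static batch dimension of an ONNX model.
import argparse
import onnx

def change_batchsize(origin_model, output_model, batch_size):
    model = onnx.load(origin_model)
    # Overwrite dim 0 of every graph input and output with the requested batch.
    for value_info in list(model.graph.input) + list(model.graph.output):
        dims = value_info.type.tensor_type.shape.dim
        if len(dims) > 0:
            dims[0].dim_value = batch_size
    onnx.save(model, output_model)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, required=True)
    parser.add_argument("--origin_model", type=str, required=True)
    parser.add_argument("--output_model", type=str, required=True)
    args = parser.parse_args()
    change_batchsize(args.origin_model, args.output_model, args.batch_size)
```
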
diff --git a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
index 2bcf4d56b3fde440ecfdefd56be02cb1b673e428..26a6bc83fd13e3d1e28a1569e299de9dc7f2b4ad 100755
--- a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
+++ b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
@@ -24,7 +9,7 @@ check_status()
 }
 
 # Run parameters
-BSZ=1
+BSZ=32
 WARM_UP=-1
 TGT=-1
 LOOP_COUNT=-1
@@ -44,15 +29,15 @@ do
 esac
 done
 
-MODEL_NAME="r50_fcos"
+MODEL_NAME="fcos_opt"
+ORIGINE_MODEL="${CHECKPOINTS_DIR}/fcos_opt.onnx"
 
-echo PROJ_DIR ${PROJ_DIR}
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
 
 step=0
-
+CURRENT_MODEL=${ORIGINE_MODEL}
 # Simplify Model
 let step++
 echo;
@@ -62,22 +47,40 @@
 if [ -f ${SIM_MODEL} ];then
     echo " "Simplify Model Skipped, ${SIM_MODEL} already exists
 else
     python3 ${RUN_DIR}/simplify_model.py \
-    --origin_model ${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx \
+    --origin_model ${CURRENT_MODEL} \
     --output_model ${SIM_MODEL}
     echo " "Generate ${SIM_MODEL}
 fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Change Batchsize Skipped, $FINAL_MODEL already exists
+else
+    python3 ${RUN_DIR}/modify_batchsize.py \
+    --batch_size ${BSZ} \
+    --origin_model ${CURRENT_MODEL} \
+    --output_model ${FINAL_MODEL}
+    echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo " "Build Engine Skipped, $ENGINE_FILE already exists
 else
     python3 ${RUN_DIR}/build_engine.py \
-    --model ${SIM_MODEL} \
+    --model ${CURRENT_MODEL} \
     --engine ${ENGINE_FILE}
     echo " "Generate Engine ${ENGINE_FILE}
 fi
@@ -86,10 +89,11 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Inference
-python3 ${RUN_DIR}/fcos_ixrt_inference.py \
+python3 ${RUN_DIR}/inference_mmdet.py \
     --engine ${ENGINE_FILE} \
     --cfg_file ${RUN_DIR}/fcos_r50_caffe_fpn_gn-head_1x_coco.py \
-    --task "pref" \
-    --batch_size 1 \
-    --target_fps 40; check_status
+    --perf_only True \
+    --datasets ${DATASETS_DIR} \
+    --batchsize ${BSZ} \
+    --fps_target ${TGT}; check_status
 exit ${EXIT_STATUS}
\ No newline at end of file
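
Likewise, `simplify_model.py` is referenced by both scripts but not shown in this diff. A plausible minimal form, using onnx-simplifier (kept as `onnxsim` in the updated requirements.txt); treat this as a sketch rather than the shipped script:

```python
# Hypothetical sketch only: constant-fold and simplify an ONNX graph.
import argparse
import onnx
from onnxsim import simplify

parser = argparse.ArgumentParser()
parser.add_argument("--origin_model", type=str, required=True)
parser.add_argument("--output_model", type=str, required=True)
args = parser.parse_args()

model = onnx.load(args.origin_model)
model_simp, check = simplify(model)  # returns (simplified model, validity flag)
assert check, "simplified ONNX model could not be validated"
onnx.save(model_simp, args.output_model)
```
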
diff --git a/models/cv/object_detection/yolov5/ixrt/build_engine.py b/models/cv/object_detection/ixrt_common/build_engine.py
similarity index 67%
rename from models/cv/object_detection/yolov5/ixrt/build_engine.py
rename to models/cv/object_detection/ixrt_common/build_engine.py
index a919bdd0183197ce125aa5492ec83e58e035675d..d47e45e518cc0bd35d2fd27f19f7da17bec44abf 100644
--- a/models/cv/object_detection/yolov5/ixrt/build_engine.py
+++ b/models/cv/object_detection/ixrt_common/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import os
 import cv2
 import argparse
diff --git a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py b/models/cv/object_detection/ixrt_common/build_nms_engine.py
similarity index 78%
rename from models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
rename to models/cv/object_detection/ixrt_common/build_nms_engine.py
index 3be0d83d0d966018f59b87d22f628b9b1ddf9b21..d260fe48fc85f6d2add2051a063a90048e9d831f 100644
--- a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/ixrt_common/build_nms_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import os
 import argparse
 import torch
@@ -20,10 +5,12 @@ import onnx
 from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
+from os.path import dirname, exists, join
+import ctypes
 
 def create_onnx(args):
     nms = helper.make_node(
-        "NMS",
+        "DetectionNMS_IxRT",
         name="NMS",
         inputs=["nms_input"],
         outputs=["nms_output0", "nms_output1"],
@@ -57,10 +44,24 @@ def create_onnx(args):
     model = onnx.helper.make_model(graph, opset_imports=[op])
     onnx_path = args.path + "/nms.onnx"
     onnx.save(model, onnx_path)
+
+def load_ixrt_plugin(
+    logger=tensorrt.Logger(tensorrt.Logger.WARNING), namespace="", dynamic_path=""
+):
+    if not dynamic_path:
+        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist; please provide a valid plugin path!"
+ ) + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + tensorrt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") def build_engine(args): onnx_path = args.path + "/nms.onnx" IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) + load_ixrt_plugin(IXRT_LOGGER) builder = tensorrt.Builder(IXRT_LOGGER) EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) network = builder.create_network(EXPLICIT_BATCH) diff --git a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py b/models/cv/object_detection/ixrt_common/calibration_dataset.py similarity index 55% rename from models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py rename to models/cv/object_detection/ixrt_common/calibration_dataset.py index de37775a0c617fdefca4342423a6a47bdc9b9c41..2473f7d0933035ef2731dd928f75cb8fea72d2f3 100644 --- a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py +++ b/models/cv/object_detection/ixrt_common/calibration_dataset.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import os import torch import torchvision.datasets diff --git a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py b/models/cv/object_detection/ixrt_common/coco_labels.py similarity index 71% rename from models/cv/object_detection/yolov5s/ixrt/coco_labels.py rename to models/cv/object_detection/ixrt_common/coco_labels.py index 43f5bd82cd257efdcab2bdba6bad64d9bb90416e..69d38878ff16d66dfe7550fcd170ac91d0862318 100644 --- a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py +++ b/models/cv/object_detection/ixrt_common/coco_labels.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - labels = [ "person", "bicycle", diff --git a/models/cv/object_detection/yolov5s/ixrt/common.py b/models/cv/object_detection/ixrt_common/common.py similarity index 78% rename from models/cv/object_detection/yolov5s/ixrt/common.py rename to models/cv/object_detection/ixrt_common/common.py index 695e05ba9605a8b251d74567830ecbeb86387b4c..7d9a078eebf15a788c55cc5b657b352a33707a31 100644 --- a/models/cv/object_detection/yolov5s/ixrt/common.py +++ b/models/cv/object_detection/ixrt_common/common.py @@ -1,22 +1,8 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. import numpy as np from tqdm import tqdm import tensorrt -import pycuda.driver as cuda +from cuda import cuda, cudart # input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] @@ -50,6 +36,23 @@ def save2json(batch_img_id, pred_boxes, json_result, class_trans): "score": p, } ) +def save2json_nonms(batch_img_id, pred_boxes, json_result): + for i, boxes in enumerate(pred_boxes): + image_id = int(batch_img_id) + if boxes is not None: + x, y, w, h, c, p = boxes + if image_id!=-1: + + x, y, w, h, p = float(x), float(y), float(w), float(h), float(p) + c = int(c) + json_result.append( + { + "image_id": image_id, + "category_id": c, + "bbox": [x, y, w, h], + "score": p, + } + ) def create_engine_context(engine_path, logger): with open(engine_path, "rb") as f: @@ -80,13 +83,15 @@ def get_io_bindings(engine): size = np.dtype(tensorrt.nptype(dtype)).itemsize for s in shape: size *= s - allocation = cuda.mem_alloc(size) + err, allocation = cudart.cudaMalloc(size) + assert err == cudart.cudaError_t.cudaSuccess binding = { "index": i, "name": name, "dtype": np.dtype(tensorrt.nptype(dtype)), "shape": list(shape), "allocation": allocation, + "nbytes": size, } print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") allocations.append(allocation) diff --git a/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV3_CONFIG similarity index 73% rename from models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG rename to models/cv/object_detection/ixrt_common/config/YOLOV3_CONFIG index 8cbd0f490342f6a7f5e06fa1087bc65cc98afb55..9b1fe49135bbec6e05aee94ea89a871e04371494 100644 --- a/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG +++ b/models/cv/object_detection/ixrt_common/config/YOLOV3_CONFIG @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
 # BSZ : batch size used for engine building and inference
 # IMGSIZE : model input height/width
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV5M_CONFIG
similarity index 73%
rename from models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV5M_CONFIG
index d6342be3715685c6c76bd63fbcc09ea8cf57209a..3eddc4f789033694b21330cce740bd229c1d9c6d 100644
--- a/models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG
+++ b/models/cv/object_detection/ixrt_common/config/YOLOV5M_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 # BSZ : batch size used for engine building and inference
 # IMGSIZE : model input height/width
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV5S_CONFIG
old mode 100755
new mode 100644
similarity index 73%
rename from models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV5S_CONFIG
index 8aa23b8e276963cf63247b252962ddf521b66dfd..c3f46cf87029af585f8c40a6b5e435b4a41fc956
--- a/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
+++ b/models/cv/object_detection/ixrt_common/config/YOLOV5S_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 # BSZ : batch size used for engine building and inference
 # IMGSIZE : model input height/width
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV7_CONFIG
similarity index 73%
rename from models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV7_CONFIG
index e6cb457d7bd09f941e5fba164265838af0ab5cb0..4803e368f3e4fa20cf05576e1cd5f12594f5d102 100644
--- a/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
+++ b/models/cv/object_detection/ixrt_common/config/YOLOV7_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 # BSZ : batch size used for engine building and inference
 # IMGSIZE : model input height/width
 # RUN_MODE : [FPS, MAP]
@@ -34,7 +19,6 @@ LAYER_FUSION=1
 DECODER_FASTER=1
 DECODER_NUM_CLASS=80
 DECODER_INPUT_NAMES=(/model/model.105/m.0/Conv_output_0 /model/model.105/m.1/Conv_output_0 /model/model.105/m.2/Conv_output_0)
-
 DECODER_8_ANCHOR=(12 16 19 36 40 28)
 DECODER_16_ANCHOR=(36 75 76 55 72 146)
 DECODER_32_ANCHOR=(142 110 192 243 459 401)
diff --git a/models/cv/object_detection/yolov5s/ixrt/cut_model.py b/models/cv/object_detection/ixrt_common/cut_model.py
similarity index 44%
rename from models/cv/object_detection/yolov5s/ixrt/cut_model.py
rename to models/cv/object_detection/ixrt_common/cut_model.py
index e9ee19aadf0809fe1b97e3225d09150fb54513f7..af0a3a4f0cc3caf05b95be3c77dea7728c931e3f 100644
--- a/models/cv/object_detection/yolov5s/ixrt/cut_model.py
+++ b/models/cv/object_detection/ixrt_common/cut_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py b/models/cv/object_detection/ixrt_common/datasets/__init__.py
similarity index 100%
rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py
rename to models/cv/object_detection/ixrt_common/datasets/__init__.py
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py b/models/cv/object_detection/ixrt_common/datasets/coco.py
old mode 100755
new mode 100644
similarity index 86%
rename from models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
rename to models/cv/object_detection/ixrt_common/datasets/coco.py
index 73c5df54761b917ecd0127fb56b61d9bd34c1196..7f355b8444e2bc8d38d5c89cb3217328c497420e
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
+++ b/models/cv/object_detection/ixrt_common/datasets/coco.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
- import os.path from typing import Any, Callable, List, Optional, Tuple diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/common.py b/models/cv/object_detection/ixrt_common/datasets/common.py similarity index 78% rename from models/cv/object_detection/yolov5/ixrt/datasets/common.py rename to models/cv/object_detection/ixrt_common/datasets/common.py index ef36eba394917bc05af46f33be48463df50f540d..a8e5e6e7fbd958b2f9dfa83da1b0f8c5bc727041 100644 --- a/models/cv/object_detection/yolov5/ixrt/datasets/common.py +++ b/models/cv/object_detection/ixrt_common/datasets/common.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import cv2 import math import numpy as np @@ -78,4 +63,6 @@ def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False): def clip_boxes(boxes, shape): boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 - boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 \ No newline at end of file + boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 + + return boxes \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py b/models/cv/object_detection/ixrt_common/datasets/post_process.py similarity index 68% rename from models/cv/object_detection/yolov3/ixrt/datasets/post_process.py rename to models/cv/object_detection/ixrt_common/datasets/post_process.py index 8590816a0df18b6ef296ebe305b15b81240ab1d0..7b411a50f075bd88ad0b17fb43a5455ef9e7c088 100644 --- a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py +++ b/models/cv/object_detection/ixrt_common/datasets/post_process.py @@ -1,21 +1,8 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
 import cv2
 import math
 import numpy as np
+import torch
+import torch.nn.functional as F
 
 from .common import letterbox, scale_boxes, clip_boxes
 
@@ -26,6 +13,8 @@ def get_post_process(data_process_type):
         return Yolov3Postprocess
     elif data_process_type == "yolox":
         return YoloxPostprocess
+    elif data_process_type == "detr":
+        return DetrPostprocess
     return None
 
 def Yolov3Postprocess(
@@ -127,4 +116,42 @@ def YoloxPostprocess(
         all_box.append(boxes)
         data_offset += max_det * 6
 
-    return all_box
\ No newline at end of file
+    return all_box
+
+def box_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(-1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+
+def DetrPostprocess(pred_logits, pred_boxes, target_sizes):
+    # decode raw DETR outputs into [x, y, w, h, label, score] rows
+    out_logits = torch.from_numpy(pred_logits)
+    out_bbox = torch.from_numpy(pred_boxes)
+    assert len(target_sizes) == 2
+
+    prob = F.softmax(out_logits, -1)
+    scores, labels = prob[..., :-1].max(-1)  # drop the trailing "no object" class
+
+    # convert to [x0, y0, x1, y1] format
+    boxes = box_cxcywh_to_xyxy(out_bbox)
+    # and from relative [0, 1] to absolute [0, height] coordinates
+    img_w, img_h = target_sizes
+    scale_fct = torch.tensor([img_w, img_h, img_w, img_h])
+    boxes = boxes * scale_fct
+
+    boxes = clip_boxes(boxes, [img_h, img_w])  # clip_boxes expects (h, w) ordering
+    boxes = convert_to_xywh(boxes)
+
+    labels = labels.unsqueeze(1)
+    scores = scores.unsqueeze(1)
+    pred_boxes = torch.cat([
+        boxes,
+        labels,
+        scores], dim=1).numpy().tolist()
+    return pred_boxes
\ No newline at end of file
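
For context on the `DetrPostprocess` addition above: DETR-style heads reserve the last logit channel for a "no object" class, which is why scoring uses `prob[..., :-1]`, and raw boxes come out as normalized (cx, cy, w, h). A small self-contained check of the box conversion (values are illustrative only):

```python
import torch

# cx, cy, w, h -> x0, y0, x1, y1, mirroring box_cxcywh_to_xyxy above
def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    return torch.stack([x_c - 0.5 * w, y_c - 0.5 * h,
                        x_c + 0.5 * w, y_c + 0.5 * h], dim=-1)

box = torch.tensor([[0.5, 0.5, 0.2, 0.4]])  # one normalized cx, cy, w, h box
print(box_cxcywh_to_xyxy(box))  # tensor([[0.4000, 0.3000, 0.6000, 0.7000]])
```
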
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py b/models/cv/object_detection/ixrt_common/datasets/pre_process.py
old mode 100755
new mode 100644
similarity index 73%
rename from models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
rename to models/cv/object_detection/ixrt_common/datasets/pre_process.py
index c651f8adb7c8190c214fbbbb7769c7d0713e9619..e5b4ddfba9d1357cdada22b8c50b7d9b6d423998
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/ixrt_common/datasets/pre_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import cv2
 import math
 import numpy as np
@@ -26,6 +11,8 @@ def get_post_process(data_process_type):
         return Yolov3Preprocess
     elif data_process_type == "yolox":
         return YoloxPreprocess
+    elif data_process_type == "detr":
+        return DetrPreprocess
     return None
 
 def Yolov3Preprocess(image, img_size):
@@ -68,4 +55,22 @@ def YoloxPreprocess(img, img_size, swap=(2,0,1)):
     padded_img = padded_img.transpose(swap)
     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
 
-    return padded_img
\ No newline at end of file
+    return padded_img
+
+def DetrPreprocess(image, img_size):
+    # resize to a square img_size x img_size input (no letterbox padding),
+    # then normalize with ImageNet mean/std in RGB order
+    mean = [0.485, 0.456, 0.406]
+    std = [0.229, 0.224, 0.225]
+
+    image = cv2.resize(image, (img_size, img_size))
+    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
+
+    image[0,:,:] = (image[0,:,:] - mean[0]) / std[0]
+    image[1,:,:] = (image[1,:,:] - mean[1]) / std[1]
+    image[2,:,:] = (image[2,:,:] - mean[2]) / std[2]
+
+    return image
+    
\ No newline at end of file
- # !/usr/bin/env python # -*- coding: utf-8 -*- import argparse @@ -129,10 +114,10 @@ def parse_args(): parser.add_argument("--dst", type=str) parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"]) parser.add_argument("--decoder_input_names", nargs='+', type=str) - parser.add_argument("--decoder8_anchor", nargs='*', type=int) - parser.add_argument("--decoder16_anchor", nargs='*', type=int) - parser.add_argument("--decoder32_anchor", nargs='*', type=int) - parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None) + parser.add_argument("--decoder8_anchor", nargs='*', type=float) + parser.add_argument("--decoder16_anchor", nargs='*', type=float) + parser.add_argument("--decoder32_anchor", nargs='*', type=float) + parser.add_argument("--decoder64_anchor", nargs='*', type=float, default=None) parser.add_argument("--num_class", type=int, default=80) parser.add_argument("--faster", type=int, default=1) parser.add_argument("--focus_input", type=str, default=None) diff --git a/models/cv/object_detection/ixrt_common/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/ixrt_common/fcos_r50_caffe_fpn_gn-head_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..07b17960b31e7b46fffec2989c89b83ad397060c --- /dev/null +++ b/models/cv/object_detection/ixrt_common/fcos_r50_caffe_fpn_gn-head_1x_coco.py @@ -0,0 +1,253 @@ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=1, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +load_from = None +log_level = 'ERROR' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +model = dict( + backbone=dict( + depth=50, + frozen_stages=1, + init_cfg=dict( + checkpoint='open-mmlab://detectron/resnet50_caffe', + type='Pretrained'), + norm_cfg=dict(requires_grad=False, type='BN'), + norm_eval=True, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + style='caffe', + type='ResNet'), + bbox_head=dict( + feat_channels=256, + in_channels=256, + loss_bbox=dict(loss_weight=1.0, type='IoULoss'), + loss_centerness=dict( + loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=True), + loss_cls=dict( + alpha=0.25, + gamma=2.0, + loss_weight=1.0, + type='FocalLoss', + use_sigmoid=True), + num_classes=80, + stacked_convs=4, + strides=[ + 8, + 16, + 32, + 64, + 128, + ], + type='FCOSHead'), + data_preprocessor=dict( + bgr_to_rgb=False, + mean=[ + 102.9801, + 115.9465, + 122.7717, + ], + pad_size_divisor=32, + std=[ + 1.0, + 1.0, + 1.0, + ], + type='DetDataPreprocessor'), + neck=dict( + add_extra_convs='on_output', + in_channels=[ + 256, + 512, + 1024, + 2048, + ], + num_outs=5, + out_channels=256, + relu_before_extra_convs=True, + start_level=1, + type='FPN'), + test_cfg=dict( + max_per_img=100, + min_bbox_size=0, + nms=dict(iou_threshold=0.5, type='nms'), + nms_pre=1000, + score_thr=0.05), + type='FCOS') +optim_wrapper = dict( + clip_grad=dict(max_norm=35, norm_type=2), + optimizer=dict(lr=0.01, momentum=0.9, type='SGD', 
weight_decay=0.0001),
+    paramwise_cfg=dict(bias_decay_mult=0.0, bias_lr_mult=2.0),
+    type='OptimWrapper')
+param_scheduler = [
+    dict(
+        begin=0,
+        by_epoch=False,
+        end=500,
+        factor=0.3333333333333333,
+        type='ConstantLR'),
+    dict(
+        begin=0,
+        by_epoch=True,
+        end=12,
+        gamma=0.1,
+        milestones=[
+            8,
+            11,
+        ],
+        type='MultiStepLR'),
+]
+resume = False
+test_cfg = dict(type='TestLoop')
+test_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_file='annotations/instances_val2017.json',
+        backend_args=None,
+        data_prefix=dict(img='val2017/'),
+        data_root='data/coco/',
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(keep_ratio=True, scale=(
+                800,
+                800,
+            ), type='Resize'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(
+                meta_keys=(
+                    'img_id',
+                    'img_path',
+                    'ori_shape',
+                    'img_shape',
+                    'scale_factor',
+                ),
+                type='PackDetInputs'),
+        ],
+        test_mode=True,
+        type='CocoDataset'),
+    drop_last=False,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+test_evaluator = dict(
+    ann_file=
+    'data/coco/annotations/instances_val2017.json',
+    backend_args=None,
+    format_only=False,
+    metric='bbox',
+    type='CocoMetric')
+test_pipeline = [
+    dict(backend_args=None, type='LoadImageFromFile'),
+    dict(keep_ratio=False, scale=(
+        800,
+        800,
+    ), type='Resize'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(
+        meta_keys=(
+            'img_id',
+            'img_path',
+            'ori_shape',
+            'img_shape',
+            'scale_factor',
+        ),
+        type='PackDetInputs'),
+]
+train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=1)
+train_dataloader = dict(
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    batch_size=2,
+    dataset=dict(
+        ann_file='annotations/instances_train2017.json',
+        backend_args=None,
+        data_prefix=dict(img='train2017/'),
+        data_root='data/coco/',
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(keep_ratio=True, scale=(
+                1333,
+                800,
+            ), type='Resize'),
+            dict(prob=0.5, type='RandomFlip'),
+            dict(type='PackDetInputs'),
+        ],
+        type='CocoDataset'),
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+train_pipeline = [
+    dict(backend_args=None, type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(keep_ratio=True, scale=(
+        1333,
+        800,
+    ), type='Resize'),
+    dict(prob=0.5, type='RandomFlip'),
+    dict(type='PackDetInputs'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(
+    batch_size=1,
+    dataset=dict(
+        ann_file='annotations/instances_val2017.json',
+        backend_args=None,
+        data_prefix=dict(img='val2017/'),
+        data_root='data/coco/',
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(keep_ratio=False, scale=(
+                800,
+                800,
+            ), type='Resize'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(
+                meta_keys=(
+                    'img_id',
+                    'img_path',
+                    'ori_shape',
+                    'img_shape',
+                    'scale_factor',
+                ),
+                type='PackDetInputs'),
+        ],
+        test_mode=True,
+        type='CocoDataset'),
+    drop_last=False,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(
+    ann_file='data/coco/annotations/instances_val2017.json',
+    backend_args=None,
+    format_only=False,
+    metric='bbox',
+    type='CocoMetric')
+vis_backends = [
+    dict(type='LocalVisBackend'),
+]
+visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './' \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/inference.py b/models/cv/object_detection/ixrt_common/inference.py similarity index 74% rename from models/cv/object_detection/yolov5s/ixrt/inference.py rename to models/cv/object_detection/ixrt_common/inference.py index ad87fe1ec0c2b9e5fa4271c31018a174fef370d5..99f22322e31e28d5f21219c754edce9e843e8db6 100644 --- a/models/cv/object_detection/yolov5s/ixrt/inference.py +++ b/models/cv/object_detection/ixrt_common/inference.py @@ -1,21 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import argparse import glob import json @@ -25,11 +10,10 @@ import sys import torch import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda +from cuda import cuda, cudart from coco_labels import coco80_to_coco91_class, labels -from common import save2json, box_class85to6 +from common import save2json, save2json_nonms, box_class85to6 from common import create_engine_context, get_io_bindings from calibration_dataset import create_dataloaders from datasets.post_process import get_post_process @@ -82,7 +66,7 @@ def main(config): inputs, outputs, allocations = get_io_bindings(engine) # Load nms_engine - if config.test_mode == "MAP" and config.nms_type == "GPU": + if config.test_mode == "MAP" and config.nms_type == "GPU" and not config.no_nms: nms_engine, nms_context = create_engine_context(config.nms_engine, logger) nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine) nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"]) @@ -98,8 +82,14 @@ def main(config): print("Warm Done.") # Prepare the output data - output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - print(f"output shape : {output.shape} output type : {output.dtype}") + if config.no_nms: + batch_pred_logits = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + batch_pred_boxes = np.zeros(outputs[1]["shape"], outputs[1]["dtype"]) + print(f"pred_logits shape : {batch_pred_logits.shape} pred_logits type : {batch_pred_logits.dtype}") + print(f"pred_boxes shape : {batch_pred_boxes.shape} pred_boxes type : {batch_pred_boxes.dtype}") + else: + output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) + print(f"output shape : {output.shape} output type : {output.dtype}") for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader): batch_data = batch_data.numpy() @@ -109,7 +99,8 @@ def main(config): cur_bsz_sample = batch_data.shape[0] # Set input - cuda.memcpy_htod(inputs[0]["allocation"], batch_data) + err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) # Forward start_time = time.time() @@ -117,9 +108,10 @@ def main(config): end_time = time.time() forward_time += 
end_time - start_time
 
-        if config.test_mode == "MAP":
+        if config.test_mode == "MAP" and not config.no_nms:
             # Fetch output
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 1 : prepare data to nms
             _, box_num, box_unit = output.shape
@@ -138,10 +130,14 @@
 
             if config.nms_type == "GPU":
                 # Set nms input
-                cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input)
+
+                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
-                cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"])
-                cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"])
+                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 3 : post process + save
             pred_boxes = post_process_func(
@@ -153,6 +149,22 @@
                 max_det=config.max_det
             )
            save2json(batch_img_id, pred_boxes, json_result, class_map)
+        elif config.test_mode == "MAP" and config.no_nms:
+            # Fetch output
+            err, = cuda.cuMemcpyDtoH(batch_pred_logits, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyDtoH(batch_pred_boxes, outputs[1]["allocation"], outputs[1]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+
+            for (pred_logits, pred_boxes, img_h, img_w, img_id) in zip(
+                batch_pred_logits,
+                batch_pred_boxes,
+                batch_img_shape[0],
+                batch_img_shape[1],
+                batch_img_id):
+                pred_boxes = post_process_func(pred_logits, pred_boxes, [img_w, img_h])
+                save2json_nonms(img_id, pred_boxes, json_result)
 
     fps = num_samples / forward_time
 
@@ -180,7 +197,6 @@
         with open(pred_json, "w") as f:
             json.dump(json_result, f)
 
-        start_time = time.time()
         anno_json = config.coco_gt
         anno = COCO(anno_json)  # init annotations api
         pred = anno.loadRes(pred_json)  # init predictions api
@@ -192,12 +208,10 @@
             f"==============================eval {config.model_name} {config.precision} coco map =============================="
         )
         eval.summarize()
-        e2e_time = time.time() - start_time
+
         map, map50 = eval.stats[:2]
-        print(F"E2E time : {e2e_time:.3f} seconds")
         print("MAP@0.5 : ", map50)
         print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        print(F"E2E time : {e2e_time:.3f} seconds")
         if map50 >= config.map_target:
             print("pass!")
             exit()
@@ -257,6 +271,7 @@
     parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
     parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly")
     parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU")
+    parser.add_argument("--no_nms", type=bool, default=False, help="engine has no NMS; decode raw outputs in post-process")
 
     config = parser.parse_args()
     print("config:", config)
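
One note on the flag handling above: `argparse` with `type=bool` converts via `bool(str)`, so any non-empty value (including the literal string "False") parses as `True`; this is why the updated shell scripts pass `--perf_only True` and simply omit the flag otherwise. The conventional alternative, shown only for comparison (adopting it would also require dropping the `True` argument in the scripts):

```python
import argparse

parser = argparse.ArgumentParser()
# store_true flags: present -> True, absent -> False; no value string is parsed
parser.add_argument("--perf_only", action="store_true", help="run the performance test only")
parser.add_argument("--no_nms", action="store_true", help="engine has no NMS; decode raw outputs on the host")
args = parser.parse_args()  # e.g. `python3 inference.py --perf_only`
```
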
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import time
+import argparse
+import tensorrt
+import torch
+import torchvision
+import numpy as np
+from tensorrt import Dims
+from cuda import cuda, cudart
+from tqdm import tqdm
+from mmdet.registry import RUNNERS
+from mmengine.config import Config
+
+from common import create_engine_context, get_io_bindings
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--engine",
+                        type=str,
+                        required=True,
+                        help="ixrt engine path.")
+
+    parser.add_argument("--cfg_file", type=str, default="")
+
+    parser.add_argument("--batchsize",
+                        type=int,
+                        required=True,
+                        help="inference batch size.")
+
+    parser.add_argument("--datasets",
+                        type=str,
+                        required=True,
+                        help="datasets path.")
+
+    parser.add_argument("--warmup",
+                        type=int,
+                        default=3,
+                        help="number of warmup before test.")
+
+    parser.add_argument("--acc_target",
+                        type=float,
+                        default=None,
+                        help="Model inference Accuracy target.")
+
+    parser.add_argument("--fps_target",
+                        type=float,
+                        default=None,
+                        help="Model inference FPS target.")
+
+    parser.add_argument("--perf_only",
+                        type=bool,
+                        default=False,
+                        help="Run performance test only")
+
+    args = parser.parse_args()
+
+    return args
+
+def main():
+    args = parse_args()
+
+    batch_size = args.batchsize
+
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+    # Load Engine && I/O bindings
+    engine, context = create_engine_context(args.engine, logger)
+    inputs, outputs, allocations = get_io_bindings(engine)
+
+    if args.warmup > 0:
+        print("\nWarm Start.")
+        for i in range(args.warmup):
+            context.execute_v2(allocations)
+        print("Warm Done.")
+
+    # just run perf test
+    if args.perf_only:
+        torch.cuda.synchronize()
+        start_time = time.time()
+
+        for i in range(10):
+            context.execute_v2(allocations)
+
+        torch.cuda.synchronize()
+        end_time = time.time()
+        forward_time = end_time - start_time
+        num_samples = 10 * args.batchsize
+        fps = num_samples / forward_time
+
+        print("FPS : ", fps)
+        print(f"Performance Check : Test {fps} >= target {args.fps_target}")
+        if fps >= args.fps_target:
+            print("pass!")
+            exit()
+        else:
+            print("failed!")
+            exit(1)
+    else:
+        # Runner config
+        cfg = Config.fromfile(args.cfg_file)
+        cfg.work_dir = "./"
+
+        cfg['test_dataloader']['batch_size'] = batch_size
+        cfg['test_dataloader']['dataset']['data_root'] = args.datasets
+        cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'val2017/'
+        cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/instances_val2017.json')
+        cfg['log_level'] = 'ERROR'
+
+        runner = RUNNERS.build(cfg)
+
+        for input_data in tqdm(runner.test_dataloader):
+
+            input_data = runner.model.data_preprocessor(input_data, False)
+            image = input_data['inputs'].cpu()
+            image = image.numpy().astype(inputs[0]["dtype"])
+            pad_batch = len(image) != batch_size
+            if pad_batch:
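+                # pad the tail batch up to the engine's fixed batch size; origin_size keeps the real sample count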
+                origin_size = len(image)
+                image = np.resize(image, (batch_size, *image.shape[1:]))
+            image = np.ascontiguousarray(image)
+
+            (err,) = cudart.cudaMemcpy(
+                inputs[0]["allocation"],
+                image,
+                image.nbytes,
+                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+            assert err == cudart.cudaError_t.cudaSuccess
+
+            context.execute_v2(allocations)
+
+            cls_score = []
+            box_reg = []
+            score_factors = []
+            # route each engine output by its channel dim: 80 -> class scores, 4 -> box regression, else score factors
+            for i in range(len(outputs)):
+                output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
+                (err,) = cudart.cudaMemcpy(
+                    output,
+                    outputs[i]["allocation"],
+                    outputs[i]["nbytes"],
+                    cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                )
+                assert err == cudart.cudaError_t.cudaSuccess
+
+                if pad_batch:
+                    output = output[:origin_size]
+
+                output = torch.from_numpy(output)
+
+                if output.shape[1] == 80:
+                    cls_score.append(output)
+                elif output.shape[1] == 4:
+                    box_reg.append(output)
+                else:
+                    score_factors.append(output)
+
+            batch_img_metas = [
+                data_samples.metainfo for data_samples in input_data['data_samples']
+            ]
+
+            # the fovea and fsaf heads predict without score_factors
+            if "fovea_r50" in args.cfg_file or "fsaf" in args.cfg_file:
+                results_list = runner.model.bbox_head.predict_by_feat(cls_score, box_reg, batch_img_metas=batch_img_metas, rescale=True)
+            else:
+                results_list = runner.model.bbox_head.predict_by_feat(cls_score, box_reg, score_factors, batch_img_metas=batch_img_metas, rescale=True)
+
+            batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], results_list)
+
+            runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=input_data)
+
+        metrics = runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset))
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/ixrt_common/load_ixrt_plugin.py
similarity index 46%
rename from models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
rename to models/cv/object_detection/ixrt_common/load_ixrt_plugin.py
index ae47dc8e854b6bea1f768e65c4dd481048bfebce..932efbdfd1a4e91d8ddfd363adf6bce989df1709 100644
--- a/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/ixrt_common/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py b/models/cv/object_detection/ixrt_common/modify_batchsize.py
similarity index 66%
rename from models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
rename to models/cv/object_detection/ixrt_common/modify_batchsize.py
index 3a88c1603bd6f457fd4965257627dc29edcda4d1..00ed65dd16bf19445396df7f72d81d653eed756d 100644
--- a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/ixrt_common/modify_batchsize.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import onnx
 import argparse
diff --git a/models/cv/object_detection/yolov5s/ixrt/quant.py b/models/cv/object_detection/ixrt_common/quant.py
similarity index 74%
rename from models/cv/object_detection/yolov5s/ixrt/quant.py
rename to models/cv/object_detection/ixrt_common/quant.py
index 36fd39a13c2e1e40f4dc0098f042e66e4bd0d26a..bcf5d9b6f73ee58fee41e27252425e7b9dc4e6fb 100644
--- a/models/cv/object_detection/yolov5s/ixrt/quant.py
+++ b/models/cv/object_detection/ixrt_common/quant.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import os
 import random
 import argparse
diff --git a/models/cv/object_detection/yolov3/ixrt/requirements.txt b/models/cv/object_detection/ixrt_common/requirements.txt
similarity index 82%
rename from models/cv/object_detection/yolov3/ixrt/requirements.txt
rename to models/cv/object_detection/ixrt_common/requirements.txt
index b0f4374b2b778c81875da50d088fecedd01689c9..46ef4ec8d824eadb4dbce5a88a997abbd38a6747 100644
--- a/models/cv/object_detection/yolov3/ixrt/requirements.txt
+++ b/models/cv/object_detection/ixrt_common/requirements.txt
@@ -4,4 +4,5 @@ onnxsim
 ultralytics
 pycocotools
 opencv-python==4.6.0.66
-pycuda
\ No newline at end of file
+pycuda
+seaborn
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/simplify_model.py b/models/cv/object_detection/ixrt_common/simplify_model.py
similarity index 46%
rename from models/cv/object_detection/fcos/ixrt/simplify_model.py
rename to models/cv/object_detection/ixrt_common/simplify_model.py
index 1400fd81ddb4b3fae1b20d0fd35082a692f5d292..b4254b6f903cb5f8775e43b2f80d5572bf45b1d6 100644
--- a/models/cv/object_detection/fcos/ixrt/simplify_model.py
+++ b/models/cv/object_detection/ixrt_common/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/cv/object_detection/yolov3/ixrt/README.md b/models/cv/object_detection/yolov3/ixrt/README.md
index c4fd306f99e24253e0123b4e192803082ebdde20..dc7dad6fcd6db1b3da0d60ea92c3eff7b1543b57 100644
--- a/models/cv/object_detection/yolov3/ixrt/README.md
+++ b/models/cv/object_detection/yolov3/ixrt/README.md
@@ -30,7 +30,7 @@ yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx

-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 ```

 ### Model Conversion
@@ -50,13 +50,13 @@ mv weights/export.onnx /Path/to/checkpoints/yolov3.onnx
 ## Model Inference

 ```bash
-export PROJ_DIR=/Path/to/yolov3/ixrt
-export DATASETS_DIR=/Path/to/coco2017/
+export PROJ_DIR=./
+export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=./coco/annotations/instances_val2017.json
-export EVAL_DIR=./coco/val2017
-export RUN_DIR=/Path/to/yolov3/ixrt
-export CONFIG_DIR=config/YOLOV3_CONFIG
+export EVAL_DIR=./coco/images/val2017
+export RUN_DIR=../../ixrt_common
+export CONFIG_DIR=../../ixrt_common/config/YOLOV3_CONFIG
 ```

 ### FP16
diff --git a/models/cv/object_detection/yolov3/ixrt/build_engine.py b/models/cv/object_detection/yolov3/ixrt/build_engine.py
deleted file mode 100644
index a919bdd0183197ce125aa5492ec83e58e035675d..0000000000000000000000000000000000000000
--- a/models/cv/object_detection/yolov3/ixrt/build_engine.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
- -import os -import cv2 -import argparse -import numpy as np - -import torch -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(config.model) - - precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 - # print("precision : ", precision) - build_config.set_flag(precision) - - plan = builder.build_serialized_network(network, build_config) - engine_file_path = config.engine - with open(engine_file_path, "wb") as f: - f.write(plan) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - # engine args - parser.add_argument("--engine", type=str, default=None) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py deleted file mode 100644 index de37775a0c617fdefca4342423a6a47bdc9b9c41..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import torch -import torchvision.datasets -from torch.utils.data import DataLoader -from datasets.coco import CocoDetection - -def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"): - dataset = CocoDetection( - root=data_path, - annFile=annFile, - img_size=img_sz, - data_process_type=data_process_type - ) - calibration_dataset = dataset - num_samples = min(5000, batch_size * step) - if num_samples > 0: - calibration_dataset = torch.utils.data.Subset( - dataset, indices=range(num_samples) - ) - - calibration_dataloader = DataLoader( - calibration_dataset, - shuffle=False, - batch_size=batch_size, - drop_last=False, - num_workers=workers, - ) - return calibration_dataloader \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh index 9fb652f13b4e02f2077b955a6a51ab1ccf8c8eb6..7d6d6fba9efd91fd2192fc1e5d46bbb927e12115 100644 --- a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh @@ -25,7 +25,7 @@ else echo "Not Support Os" fi -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt mkdir checkpoints unzip -q /root/data/3rd_party/onnx_tflite_yolov3.zip -d ./ cp /root/data/checkpoints/yolov3.weights onnx_tflite_yolov3/weights diff --git a/models/cv/object_detection/yolov3/ixrt/coco_labels.py b/models/cv/object_detection/yolov3/ixrt/coco_labels.py deleted file mode 100644 index 43f5bd82cd257efdcab2bdba6bad64d9bb90416e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/coco_labels.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -labels = [ - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "backpack", - "umbrella", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "dining table", - "toilet", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", -] -def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) - return [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] - -__all__ = ["labels"] diff --git a/models/cv/object_detection/yolov3/ixrt/common.py b/models/cv/object_detection/yolov3/ixrt/common.py deleted file mode 100644 index aba2117c9942d6823abf73bf3ab94c291a7705e2..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/common.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import numpy as np -from tqdm import tqdm - -import tensorrt -import pycuda.driver as cuda - -# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] -# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] -def box_class85to6(input): - center_x_y = input[:, :2] - side = input[:, 2:4] - conf = input[:, 4:5] - class_id = np.argmax(input[:, 5:], axis = -1) - class_id = class_id.astype(np.float32).reshape(-1, 1) + 1 - max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1) - x1_y1 = center_x_y - 0.5 * side - x2_y2 = center_x_y + 0.5 * side - nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1) - return nms_input - -def save2json(batch_img_id, pred_boxes, json_result, class_trans): - for i, boxes in enumerate(pred_boxes): - if boxes is not None: - image_id = int(batch_img_id[i]) - # have no target - if image_id == -1: - continue - for x, y, w, h, c, p in boxes: - x, y, w, h, p = float(x), float(y), float(w), float(h), float(p) - c = int(c) - json_result.append( - { - "image_id": image_id, - "category_id": class_trans[c - 1], - "bbox": [x, y, w, h], - "score": p, - } - ) - -def create_engine_context(engine_path, logger): - with open(engine_path, "rb") as f: - runtime = tensorrt.Runtime(logger) - assert runtime - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine - context = engine.create_execution_context() - assert context - - return engine, context - -def get_io_bindings(engine): - # Setup I/O bindings - inputs = [] - outputs = [] - allocations = [] - - for i in range(engine.num_bindings): - is_input = False - if engine.binding_is_input(i): - is_input = True - name = engine.get_binding_name(i) - dtype = engine.get_binding_dtype(i) - shape = engine.get_binding_shape(i) - if is_input: - batch_size = shape[0] - size = np.dtype(tensorrt.nptype(dtype)).itemsize - for s in shape: - size *= s - allocation = cuda.mem_alloc(size) - binding = { - "index": i, - "name": name, - "dtype": np.dtype(tensorrt.nptype(dtype)), - "shape": list(shape), - "allocation": allocation, - } - print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") - allocations.append(allocation) - if engine.binding_is_input(i): - inputs.append(binding) - else: - outputs.append(binding) - return inputs, outputs, allocations \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/cut_model.py b/models/cv/object_detection/yolov3/ixrt/cut_model.py deleted file mode 100644 index e9ee19aadf0809fe1b97e3225d09150fb54513f7..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/cut_model.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import onnx -import argparse -from onnxsim import simplify - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_model", type=str) - parser.add_argument("--output_model", type=str) - parser.add_argument("--input_names", nargs='+', type=str) - parser.add_argument("--output_names", nargs='+', type=str) - args = parser.parse_args() - return args - -args = parse_args() -onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names) -print(" Cut Model Done.") \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py deleted file mode 100644 index 162e24b462289dcee7b7a2888b93fad1115def81..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/coco.py b/models/cv/object_detection/yolov3/ixrt/datasets/coco.py deleted file mode 100644 index 73c5df54761b917ecd0127fb56b61d9bd34c1196..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/datasets/coco.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os.path -from typing import Any, Callable, List, Optional, Tuple - -import cv2 - -from .vision import VisionDataset -from .pre_process import get_post_process -class CocoDetection(VisionDataset): - """`MS Coco Detection `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. 
- """ - - def __init__( - self, - root: str, - annFile: str, - img_size: int, - data_process_type: str, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - transforms: Optional[Callable] = None, - - ) -> None: - super().__init__(root, transforms, transform, target_transform) - from pycocotools.coco import COCO - - self.coco = COCO(annFile) - self.ids = list(sorted(self.coco.imgs.keys())) - self.img_size = img_size - - self.transforms = get_post_process(data_process_type) - - def _load_image(self, id: int): - path = self.coco.loadImgs(id)[0]["file_name"] - data = cv2.imread(os.path.join(self.root, path)) - return data - - def _load_target(self, id: int) -> List[Any]: - return self.coco.loadAnns(self.coco.getAnnIds(id)) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - id = self.ids[index] - image = self._load_image(id) - target = self._load_target(id) - origin_shape = image.shape[:2] - - if self.transforms is not None: - image = self.transforms(image, self.img_size) - - if len(target) > 0: - image_id = target[0]["image_id"] - else: - # have no target - image_id = -1 - return image, origin_shape, image_id - - def __len__(self) -> int: - return len(self.ids) - - -class CocoCaptions(CocoDetection): - """`MS Coco Captions `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. - - Example: - - .. code:: python - - import torchvision.datasets as dset - import torchvision.transforms as transforms - cap = dset.CocoCaptions(root = 'dir where images are', - annFile = 'json annotation file', - transform=transforms.PILToTensor()) - - print('Number of samples: ', len(cap)) - img, target = cap[3] # load 4th sample - - print("Image Size: ", img.size()) - print(target) - - Output: :: - - Number of samples: 82783 - Image Size: (3L, 427L, 640L) - [u'A plane emitting smoke stream flying over a mountain.', - u'A plane darts across a bright blue sky behind a mountain covered in snow', - u'A plane leaves a contrail above the snowy mountain top.', - u'A mountain that has a plane flying overheard in the distance.', - u'A mountain view with a plume of smoke in the background'] - - """ - - def _load_target(self, id: int) -> List[str]: - return [ann["caption"] for ann in super()._load_target(id)] diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/common.py b/models/cv/object_detection/yolov3/ixrt/datasets/common.py deleted file mode 100644 index ef36eba394917bc05af46f33be48463df50f540d..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/datasets/common.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32): - # Resize and pad image while meeting stride-multiple constraints - shape = im.shape[:2] # current shape [height, width] - if isinstance(new_shape, int): - new_shape = (new_shape, new_shape) - - # Scale ratio (new / old) - r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) - if not scaleup: # only scale down, do not scale up (for better val mAP) - r = min(r, 1.0) - - # Compute padding - ratio = r, r # width, height ratios - new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) - dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding - if auto: # minimum rectangle - dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding - elif scaleFill: # stretch - dw, dh = 0.0, 0.0 - new_unpad = (new_shape[1], new_shape[0]) - ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios - - dw /= 2 # divide padding into 2 sides - dh /= 2 - - if shape[::-1] != new_unpad: # resize - im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) - top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) - left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) - im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border - return im, ratio, (dw, dh) - -def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False): - # Rescale boxes (xyxy) from net_shape to ori_shape - - if use_letterbox: - - gain = min( - net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1] - ) # gain = new / old - pad = (net_shape[1] - ori_shape[1] * gain) / 2, ( - net_shape[0] - ori_shape[0] * gain - ) / 2.0 - - boxes[:, [0, 2]] -= pad[0] # x padding - boxes[:, [1, 3]] -= pad[1] # y padding - boxes[:, :4] /= gain - else: - x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0] - - boxes[:, 0] /= x_scale - boxes[:, 1] /= y_scale - boxes[:, 2] /= x_scale - boxes[:, 3] /= y_scale - - clip_boxes(boxes, ori_shape) - return boxes - -def clip_boxes(boxes, shape): - - boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 - boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py deleted file mode 100644 index c651f8adb7c8190c214fbbbb7769c7d0713e9619..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Preprocess - elif data_process_type == "yolov3": - return Yolov3Preprocess - elif data_process_type == "yolox": - return YoloxPreprocess - return None - -def Yolov3Preprocess(image, img_size): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - image = cv2.resize(image, (img_size, img_size)) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def Yolov5Preprocess(image, img_size, augment=False): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - if r != 1: # if sizes are not equal - interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA - image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp) - - # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size rect == True - - image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def YoloxPreprocess(img, img_size, swap=(2,0,1)): - - padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114 - r = min(img_size / img.shape[0], img_size / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - - return padded_img \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/vision.py b/models/cv/object_detection/yolov3/ixrt/datasets/vision.py deleted file mode 100644 index eadefb2c5b35abd0a11fa85c65891461a210aef8..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/datasets/vision.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -from typing import Any, Callable, List, Optional, Tuple - -import torch -import torch.utils.data as data - -from types import FunctionType - -def _log_api_usage_once(obj: Any) -> None: - - """ - Logs API usage(module and name) within an organization. - In a large ecosystem, it's often useful to track the PyTorch and - TorchVision APIs usage. This API provides the similar functionality to the - logging module in the Python stdlib. It can be used for debugging purpose - to log which methods are used and by default it is inactive, unless the user - manually subscribes a logger via the `SetAPIUsageLogger method `_. - Please note it is triggered only once for the same API call within a process. - It does not collect any data from open-source users since it is no-op by default. - For more information, please refer to - * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; - * Logging policy: https://github.com/pytorch/vision/issues/5052; - - Args: - obj (class instance or method): an object to extract info from. - """ - module = obj.__module__ - if not module.startswith("torchvision"): - module = f"torchvision.internal.{module}" - name = obj.__class__.__name__ - if isinstance(obj, FunctionType): - name = obj.__name__ - torch._C._log_api_usage_once(f"{module}.{name}") - -class VisionDataset(data.Dataset): - """ - Base Class For making datasets which are compatible with torchvision. - It is necessary to override the ``__getitem__`` and ``__len__`` method. - - Args: - root (string): Root directory of dataset. - transforms (callable, optional): A function/transforms that takes in - an image and a label and returns the transformed versions of both. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.RandomCrop`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - - .. note:: - - :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive. - """ - - _repr_indent = 4 - - def __init__( - self, - root: str, - transforms: Optional[Callable] = None, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - ) -> None: - _log_api_usage_once(self) - if isinstance(root, str): - root = os.path.expanduser(root) - self.root = root - - has_transforms = transforms is not None - has_separate_transform = transform is not None or target_transform is not None - if has_transforms and has_separate_transform: - raise ValueError("Only transforms or transform/target_transform can be passed as argument") - - # for backwards-compatibility - self.transform = transform - self.target_transform = target_transform - - if has_separate_transform: - transforms = StandardTransform(transform, target_transform) - self.transforms = transforms - - def __getitem__(self, index: int) -> Any: - """ - Args: - index (int): Index - - Returns: - (Any): Sample and meta data, optionally transformed by the respective transforms. 
- """ - raise NotImplementedError - - def __len__(self) -> int: - raise NotImplementedError - - def __repr__(self) -> str: - head = "Dataset " + self.__class__.__name__ - body = [f"Number of datapoints: {self.__len__()}"] - if self.root is not None: - body.append(f"Root location: {self.root}") - body += self.extra_repr().splitlines() - if hasattr(self, "transforms") and self.transforms is not None: - body += [repr(self.transforms)] - lines = [head] + [" " * self._repr_indent + line for line in body] - return "\n".join(lines) - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def extra_repr(self) -> str: - return "" - - -class StandardTransform: - def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: - self.transform = transform - self.target_transform = target_transform - - def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]: - if self.transform is not None: - input = self.transform(input) - if self.target_transform is not None: - target = self.target_transform(target) - return input, target - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def __repr__(self) -> str: - body = [self.__class__.__name__] - if self.transform is not None: - body += self._format_transform_repr(self.transform, "Transform: ") - if self.target_transform is not None: - body += self._format_transform_repr(self.target_transform, "Target transform: ") - - return "\n".join(body) diff --git a/models/cv/object_detection/yolov3/ixrt/deploy.py b/models/cv/object_detection/yolov3/ixrt/deploy.py deleted file mode 100644 index 8c2cc424f699e01bc88dab98a29dc4c83e4d9b9e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/deploy.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -# !/usr/bin/env python -# -*- coding: utf-8 -*- -import argparse -from tensorrt.deploy.api import GraphTransform, create_source, create_target - -class Transform: - def __init__(self, graph): - self.t = GraphTransform(graph) - self.graph = graph - - def ReplaceFocus(self, input_edge, outputs, to_op): - input_var = self.graph.get_variable(input_edge) - op = self.graph.get_operator(to_op) - self.t.delete_operators_between_var_op( - from_var=input_var, to_op=op - ) - self.t.make_operator( - "Focus", inputs=input_edge, outputs=outputs - ) - return self.graph - - def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes): - if attributes["anchor"] is None: - del attributes["anchor"] - self.t.make_operator( - op_type, inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - - def AddConcatOp(self, inputs: list, outputs, **attributes): - self.t.make_operator( - "Concat", inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - -def customize_ops(graph, args): - t = Transform(graph) - fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None - if fuse_focus: - graph = t.ReplaceFocus( - input_edge=args.focus_input, - outputs=args.focus_output, - to_op=args.focus_last_node - ) - decoder_input = args.decoder_input_names - num = len(decoder_input) // 3 - graph = t.AddYoloDecoderOp( - inputs=decoder_input[:num], - outputs=["decoder_8"], - op_type=args.decoder_type, - anchor=args.decoder8_anchor, - num_class=args.num_class, - stride=8, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num:num*2], - outputs=["decoder_16"], - op_type=args.decoder_type, - anchor=args.decoder16_anchor, - num_class=args.num_class, - stride=16, - faster_impl=args.faster - ) - - if args.decoder64_anchor is not None: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:num*2+1], - outputs=["decoder_32"], - op_type=args.decoder_type, - anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2+1:], - outputs=["decoder_64"], - op_type=args.decoder_type, - anchor=args.decoder64_anchor, - num_class=args.num_class, - stride=64, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"], - outputs=["output"], - axis=1 - ) - else: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:], - outputs=["decoder_32"], - op_type=args.decoder_type, - anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_32", "decoder_16", "decoder_8"], - outputs=["output"], - axis=1 - ) - - graph.outputs.clear() - graph.add_output("output") - graph.outputs["output"].dtype = "FLOAT" - return graph - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--src", type=str) - parser.add_argument("--dst", type=str) - parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"]) - parser.add_argument("--decoder_input_names", nargs='+', type=str) - parser.add_argument("--decoder8_anchor", nargs='*', type=int) - parser.add_argument("--decoder16_anchor", nargs='*', type=int) - parser.add_argument("--decoder32_anchor", nargs='*', type=int) - parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None) - parser.add_argument("--num_class", type=int, default=80) - 
parser.add_argument("--faster", type=int, default=1) - parser.add_argument("--focus_input", type=str, default=None) - parser.add_argument("--focus_output", type=str, default=None) - parser.add_argument("--focus_last_node", type=str, default=None) - args = parser.parse_args() - return args - -if __name__ == "__main__": - - args = parse_args() - graph = create_source(args.src)() - graph = customize_ops(graph, args) - create_target(saved_path=args.dst).export(graph) - print("Surged onnx lies on", args.dst) \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/inference.py b/models/cv/object_detection/yolov3/ixrt/inference.py deleted file mode 100644 index 4241328227c174d29ab093d4317e5591ab920b88..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/inference.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import argparse -import glob -import json -import os -import time -import sys - -import torch -import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda - -from coco_labels import coco80_to_coco91_class, labels -from common import save2json, box_class85to6 -from common import create_engine_context, get_io_bindings -from calibration_dataset import create_dataloaders -from datasets.post_process import get_post_process - -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -from tqdm import tqdm -from tqdm.contrib import tzip - -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - - # Load dataloader - dataloader = create_dataloaders( - data_path=config.eval_dir, - annFile=config.coco_gt, - img_sz=config.imgsz, - batch_size=config.bsz, - step=config.loop_count, - data_process_type=config.data_process_type - ) - - # Load post process func - if config.test_mode == "MAP": - post_process_func = get_post_process(config.data_process_type) - - bsz = config.bsz - num_samples = 5000 - if config.loop_count > 0 and config.loop_count < num_samples/bsz : - num_samples = bsz * config.loop_count - num_batch = len(dataloader) - print("=" * 30) - print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}") - print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}") - print("=" * 30) - - json_result = [] - forward_time = 0.0 - class_map = coco80_to_coco91_class() - - host_mem = tensorrt.IHostMemory - logger = tensorrt.Logger(tensorrt.Logger.ERROR) - - # Load Engine - engine, context = create_engine_context(config.model_engine, logger) - inputs, outputs, allocations = get_io_bindings(engine) - - # Load nms_engine - if config.test_mode == "MAP" and config.nms_type == "GPU": - nms_engine, nms_context = create_engine_context(config.nms_engine, logger) - nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine) - nms_output0 = 
np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"]) - nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"]) - print(f"nms_output0 shape : {nms_output0.shape} nms_output0 type : {nms_output0.dtype}") - print(f"nms_output1 shape : {nms_output1.shape} nms_output1 type : {nms_output1.dtype}") - - # Warm up - if config.warm_up > 0: - print("\nWarm Start.") - for i in range(config.warm_up): - context.execute_v2(allocations) - print("Warm Done.") - - # Prepare the output data - output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - print(f"output shape : {output.shape} output type : {output.dtype}") - - for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader): - batch_data = batch_data.numpy() - batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()] - # batch_img_id = batch_img_id.numpy() - - cur_bsz_sample = batch_data.shape[0] - - # Set input - cuda.memcpy_htod(inputs[0]["allocation"], batch_data) - - # Forward - start_time = time.time() - context.execute_v2(allocations) - end_time = time.time() - forward_time += end_time - start_time - - if config.test_mode == "MAP": - # Fetch output - cuda.memcpy_dtoh(output, outputs[0]["allocation"]) - - # Step 1 : prepare data to nms - _, box_num, box_unit = output.shape - if config.debug: - print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}") - - if config.decoder_faster == 0: - nms_input = box_class85to6(output.reshape(-1, box_unit)) - else: - nms_input = output - - # Step 2 : nms - # cpu nms(TODO) - - # gpu nms - if config.nms_type == "GPU": - - # Set nms input - cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input) - nms_context.execute_v2(nms_allocations) - cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"]) - cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"]) - - # Step 3 : post process + save - pred_boxes = post_process_func( - ori_img_shape=batch_img_shape, - imgsz=(config.imgsz, config.imgsz), - box_datas=nms_output0, - box_nums=nms_output1, - sample_num=cur_bsz_sample, - max_det=config.max_det - ) - save2json(batch_img_id, pred_boxes, json_result, class_map) - fps = num_samples / forward_time - - if config.test_mode == "FPS": - print("FPS : ", fps) - print(f"Performance Check : Test {fps} >= target {config.fps_target}") - if fps >= config.fps_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - - if config.test_mode == "MAP": - if len(json_result) == 0: - print("Predict zero box!") - exit(1) - - if not os.path.exists(config.pred_dir): - os.makedirs(config.pred_dir) - - pred_json = os.path.join( - config.pred_dir, f"{config.model_name}_{config.precision}_preds.json" - ) - with open(pred_json, "w") as f: - json.dump(json_result, f) - - start_time = time.time() - anno_json = config.coco_gt - anno = COCO(anno_json) # init annotations api - pred = anno.loadRes(pred_json) # init predictions api - eval = COCOeval(anno, pred, "bbox") - - eval.evaluate() - eval.accumulate() - print( - f"==============================eval {config.model_name} {config.precision} coco map ==============================" - ) - eval.summarize() - e2e_time = time.time() - start_time - map, map50 = eval.stats[:2] - print("MAP@0.5 : ", map50) - print(f"Accuracy Check : Test {map50} >= target {config.map_target}") - print(F"E2E time : {e2e_time:.3f} seconds") - if map50 >= config.map_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - -def parse_config(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", type=str, 
default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX" - ) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") - parser.add_argument( - "--model_engine", - type=str, - default="", - help="model engine path", - ) - parser.add_argument( - "--nms_engine", - type=str, - default="", - help="nms engine path", - ) - parser.add_argument( - "--coco_gt", - type=str, - default="data/datasets/cv/coco2017/annotations/instances_val2017.json", - help="coco instances_val2017.json", - ) - parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") - parser.add_argument("--loop_count", type=int, default=-1, help="loop count") - parser.add_argument( - "--eval_dir", - type=str, - default="data/datasets/cv/coco2017/val2017", - help="coco image dir", - ) - parser.add_argument("--bsz", type=int, default=32, help="test batch size") - parser.add_argument( - "--imgsz", - "--img", - "--img-size", - type=int, - default=640, - help="inference size h,w", - ) - parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--use_async", action="store_true") - parser.add_argument("--debug", action="store_true") - parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs") - parser.add_argument("--map_target", type=float, default=0.56, help="target mAP") - parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps") - parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly") - parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU") - - config = parser.parse_args() - print("config:", config) - return config - -if __name__ == "__main__": - config = parse_config() - main(config) \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py deleted file mode 100644 index 3a88c1603bd6f457fd4965257627dc29edcda4d1..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import onnx -import argparse - -def change_input_dim(model, bsz): - batch_size = bsz - - # The following code changes the first dimension of every input to be batch_size - # Modify as appropriate ... note that this requires all inputs to - # have the same batch_size - inputs = model.graph.input - for input in inputs: - # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim. - # Add checks as needed. 
- dim1 = input.type.tensor_type.shape.dim[0] - # update dim to be a symbolic value - if isinstance(batch_size, str): - # set dynamic batch size - dim1.dim_param = batch_size - elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int): - # set given batch size - dim1.dim_value = int(batch_size) - else: - # set batch size of 1 - dim1.dim_value = 1 - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--batch_size", type=int) - parser.add_argument("--origin_model", type=str) - parser.add_argument("--output_model", type=str) - args = parser.parse_args() - return args - -args = parse_args() -model = onnx.load(args.origin_model) -change_input_dim(model, args.batch_size) -onnx.save(model, args.output_model) \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/quant.py b/models/cv/object_detection/yolov3/ixrt/quant.py deleted file mode 100644 index 36fd39a13c2e1e40f4dc0098f042e66e4bd0d26a..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/quant.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -import random -import argparse -import numpy as np -from tensorrt.deploy import static_quantize - -import torch -from calibration_dataset import create_dataloaders - -def setseed(seed=42): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str) - parser.add_argument("--model", type=str, default="yolov5s_with_decoder.onnx") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017") - parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json") - parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile") - parser.add_argument("--disable_quant_names", nargs='*', type=str) - parser.add_argument("--save_dir", type=str, help="save path", default=None) - parser.add_argument("--bsz", type=int, default=32) - parser.add_argument("--step", type=int, default=20) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--imgsz", type=int, default=640) - args = parser.parse_args() - return args - -args = parse_args() -setseed(args.seed) -model_name = args.model_name - -out_dir = args.save_dir -dataloader = create_dataloaders( - data_path=args.dataset_dir, - annFile=args.ann_file, - img_sz=args.imgsz, - batch_size=args.bsz, - step=args.step, - data_process_type=args.data_process_type -) -# print("disable_quant_names : ", args.disable_quant_names) -static_quantize(args.model, - calibration_dataloader=dataloader, - save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"), - observer=args.observer, - 
data_preprocess=lambda x: x[0].to("cuda"), - quant_format="qdq", - disable_quant_names=args.disable_quant_names) \ No newline at end of file diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh index 7d6a609e03064b8334da1d3e91b958ead081686f..81f27858bba76f9ac497b7144883d0b3fd045f3d 100644 --- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh +++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh index 4fdf2ada533a5ff82d0b68bf8c41648cca6d1eec..357fb10ba605e56bcaa853037c40213b6be09609 100644 --- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh +++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() @@ -46,7 +32,6 @@ done source ${CONFIG_DIR} ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} -echo PROJ_DIR : ${PROJ_DIR} echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} echo DATASETS_DIR : ${DATASETS_DIR} echo RUN_DIR : ${RUN_DIR} diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh index e216261271056f05c59ee49373c5389726c9097b..a0961b5c1d63687a5c99d4ddaeba10237ea72f99 100644 --- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh +++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh index 7faed28cd3cea40673078c1bbb0eac4311e2c9a6..d0f1f48a3d42c655cff6130aad715fc56048b2e0 100644 --- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh +++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov3/ixrt/simplify_model.py b/models/cv/object_detection/yolov3/ixrt/simplify_model.py deleted file mode 100644 index 1400fd81ddb4b3fae1b20d0fd35082a692f5d292..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov3/ixrt/simplify_model.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
-import onnx
-import argparse
-from onnxsim import simplify
-
-# Simplify
-def simplify_model(args):
-    onnx_model = onnx.load(args.origin_model)
-    model_simp, check = simplify(onnx_model)
-    model_simp = onnx.shape_inference.infer_shapes(model_simp)
-    onnx.save(model_simp, args.output_model)
-    print(" Simplify onnx Done.")
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-simplify_model(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/README.md b/models/cv/object_detection/yolov4/ixrt/README.md
index e425710b4b003898dd90c276f2ee6177ccf881dd..f6bd831e431a4c70064a9d348d143e397bd8889e 100644
--- a/models/cv/object_detection/yolov4/ixrt/README.md
+++ b/models/cv/object_detection/yolov4/ixrt/README.md
@@ -38,21 +38,26 @@ pip3 install -r requirements.txt
 git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git yolov4
 
 # download weight
-mkdir data
-wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P data
+mkdir checkpoints
+wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P checkpoints
 
 # export onnx model
-python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight data/yolov4.weights --batchsize 16 --output data/yolov4.onnx
-mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
-
-# Use onnxsim optimize onnx model
-onnxsim data/yolov4.onnx data/yolov4_sim.onnx
-
-# Make sure the dataset path is "data/coco"
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight checkpoints/yolov4.weights --output yolov4.onnx
+mv yolov4.onnx checkpoints/yolov4.onnx
 ```
 
 ## Model Inference
 
+```bash
+export PROJ_DIR=./
+export DATASETS_DIR=./coco/
+export CHECKPOINTS_DIR=./checkpoints
+export COCO_GT=./coco/annotations/instances_val2017.json
+export EVAL_DIR=./coco/images/val2017
+export RUN_DIR=./
+export CONFIG_DIR=config/YOLOV4_CONFIG
+```
+
 ### FP16
 
 ```bash
diff --git a/models/cv/object_detection/yolov4/ixrt/build_engine.py b/models/cv/object_detection/yolov4/ixrt/build_engine.py
index ec4080edd3c275a4595cbfb407a21cebdada7fa7..d47e45e518cc0bd35d2fd27f19f7da17bec44abf 100644
--- a/models/cv/object_detection/yolov4/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov4/ixrt/build_engine.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
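Taken together, the README's new environment block and the `source ${CONFIG_DIR}` line in the rewritten scripts replace the old hard-coded `PROJ_DIR`-relative paths. A hypothetical end-to-end invocation, assuming the exports above have already been run; `--bs` and `--tgt` are the only two flags the scripts parse, and the values shown are illustrative:

```bash
# Accuracy (MAP mode): batch size 32, mAP target passed through to inference.py
bash scripts/infer_yolov4_fp16_accuracy.sh --bs 32 --tgt 0.30

# Performance (FPS mode): batch size 32 against an illustrative FPS target
bash scripts/infer_yolov4_fp16_performance.sh --bs 32 --tgt 1010
```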
import os import cv2 import argparse @@ -19,13 +5,11 @@ import numpy as np import torch import tensorrt -from tensorrt import Dims from load_ixrt_plugin import load_ixrt_plugin load_ixrt_plugin() - -def build_engine_trtapi_staticshape(config): +def main(config): IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) builder = tensorrt.Builder(IXRT_LOGGER) EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) @@ -42,42 +26,6 @@ def build_engine_trtapi_staticshape(config): engine_file_path = config.engine with open(engine_file_path, "wb") as f: f.write(plan) - print("Build static shape engine done!") - - -def build_engine_trtapi_dynamicshape(config): - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - - profile = builder.create_optimization_profile() - profile.set_shape("input", - Dims([1, 3, 608, 608]), - Dims([32, 3, 608, 608]), - Dims([64, 3, 608, 608]), - ) - build_config.add_optimization_profile(profile) - - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(config.model) - precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 - # print("precision : ", precision) - build_config.set_flag(precision) - - # set dynamic - num_inputs = network.num_inputs - for i in range(num_inputs): - input_tensor = network.get_input(i) - input_tensor.shape = Dims([-1, 3, 608, 608]) - - plan = builder.build_serialized_network(network, build_config) - engine_file_path = config.engine - with open(engine_file_path, "wb") as f: - f.write(plan) - print("Build dynamic shape engine done!") - def parse_args(): parser = argparse.ArgumentParser() @@ -90,8 +38,6 @@ def parse_args(): args = parser.parse_args() return args - if __name__ == "__main__": args = parse_args() - build_engine_trtapi_staticshape(args) - # build_engine_trtapi_dynamicshape(args) + main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov4/ixrt/calibration_dataset.py similarity index 55% rename from models/cv/object_detection/yolov5/ixrt/calibration_dataset.py rename to models/cv/object_detection/yolov4/ixrt/calibration_dataset.py index de37775a0c617fdefca4342423a6a47bdc9b9c41..578e013db932c53f0cfa2790e375d7b699081168 100644 --- a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py +++ b/models/cv/object_detection/yolov4/ixrt/calibration_dataset.py @@ -1,22 +1,10 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
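The trimmed `build_engine.py` above keeps only the static-shape path. For orientation, here is a condensed sketch of the flow that `main()` follows, using the TensorRT 8-style API visible in the diff; the file names are placeholders, and this is a sketch rather than the exact file:

```python
import tensorrt

# Mirror of the static-shape build: explicit-batch network, ONNX parse,
# precision flag, then a serialized plan written to disk.
logger = tensorrt.Logger(tensorrt.Logger.WARNING)
builder = tensorrt.Builder(logger)
network = builder.create_network(1 << int(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
build_config = builder.create_builder_config()

parser = tensorrt.OnnxParser(network, logger)
parser.parse_from_file("yolov4_quant_bs32_with_nms.onnx")  # placeholder path

build_config.set_flag(tensorrt.BuilderFlag.FP16)  # or tensorrt.BuilderFlag.INT8

plan = builder.build_serialized_network(network, build_config)
with open("yolov4_fp16_bs32.engine", "wb") as f:  # placeholder path
    f.write(plan)
```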
-
 import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
+
+
+
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
index f5381ef3db88f61e26d50b601f58b046ffa79317..63b534209dc77e84945c8f442bb7c201e9905171 100644
--- a/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
@@ -30,10 +30,8 @@ pip3 install -r requirements.txt
 
 # clone yolov4
 cp -r /root/data/3rd_party/yolov4 ./
-mkdir data
+mkdir checkpoints
 # export onnx model
-python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight /root/data/checkpoints/yolov4.weights --batchsize 16 --output data/yolov4.onnx
-mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight /root/data/checkpoints/yolov4.weights --output yolov4.onnx
+mv yolov4.onnx checkpoints/yolov4.onnx
 
-# Use onnxsim optimize onnx model
-onnxsim data/yolov4.onnx data/yolov4_sim.onnx
diff --git a/models/cv/object_detection/yolov4/ixrt/coco_labels.py b/models/cv/object_detection/yolov4/ixrt/coco_labels.py
index 5fc21282c7fa393e9d15e8bdc16c741dc7e78448..69d38878ff16d66dfe7550fcd170ac91d0862318 100644
--- a/models/cv/object_detection/yolov4/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov4/ixrt/coco_labels.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG b/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG
new file mode 100644
index 0000000000000000000000000000000000000000..c04994949f0f65d799f77fb9f2ec62b96c0becbe
--- /dev/null
+++ b/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG
@@ -0,0 +1,49 @@
+# BSZ : batch size used when building the engine and at inference time
+# IMGSIZE : model input height/width
+# RUN_MODE : [FPS, MAP]
+# PRECISION : [float16, int8]
+# MODEL_NAME : basename of the generated onnx/engine files
+# ORIGINE_MODEL : the original onnx file
+# COCO_GT : COCO evaluation annotation file
+# DATASET_DIR : dataset path for quantization/inference
+# CHECKPOINTS_DIR : directory holding the generated onnx/engine files
+# LAYER_FUSION : run the decoder through fused operators; 0 = unfused, 1 = fused
+# DECODER_FASTER : there are two fused implementations; the faster one is quicker and can feed GPU NMS directly, while the other keeps its output identical to the onnx. 1 = faster
+IMGSIZE=416
+MODEL_NAME=yolov4
+ORIGINE_MODEL=yolov4.onnx
+DATA_PROCESS_TYPE=yolov4
+MODEL_INPUT_NAMES=(input)
+
+LAYER_FUSION=1
+DECODER_FASTER=1
+DECODER_NUM_CLASS=80
+DECODER_INPUT_NAMES=(/models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0)
+DECODER_8_ANCHOR=(12 16 19 36 40 28)
+DECODER_16_ANCHOR=(36 75 76 55 72 146)
+DECODER_32_ANCHOR=(142 110 192 243 459 401)
+
+# NMS CONFIG
+    # IOU_THRESH : IoU threshold
+    # SCORE_THRESH : bbox confidence threshold
+    # MAX_BOX_PRE_IMG : maximum number of predicted bboxes per image
+    # ALL_BOX_NUM : number of boxes per image fed into NMS
+    # NMS_TYPE : GPU/CPU(TODO)
+IOU_THRESH=0.6
+SCORE_THRESH=0.001
+MAX_BOX_PRE_IMG=1000
+ALL_BOX_NUM=10647
+NMS_TYPE=GPU
+
+# QUANT CONFIG (takes effect only when PRECISION is int8)
+    # QUANT_OBSERVER : quantization strategy, one of [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE : dataloader batch size for quantization; best kept equal to the onnx batch size, since some ops (e.g. Reshape) may otherwise infer shapes incorrectly
+    # QUANT_STEP : number of quantization steps
+    # QUANT_SEED : random seed, keeps the quantization result reproducible
+    # QUANT_EXIST_ONNX : set this if a quantized model from another source should be used
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=1
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=()
+QUANT_EXIST_ONNX=
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/cut_model.py b/models/cv/object_detection/yolov4/ixrt/cut_model.py
index cf4f88dae926b8d15356c7f6b48d89fe80dc9f2a..af0a3a4f0cc3caf05b95be3c77dea7728c931e3f 100644
--- a/models/cv/object_detection/yolov4/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov4/ixrt/cut_model.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
 import onnx
 import argparse
 from onnxsim import simplify
@@ -27,4 +13,4 @@ def parse_args():
 args = parse_args()
 onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
-print(" Cut Model Done.")
+print(" Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/deploy.py b/models/cv/object_detection/yolov4/ixrt/deploy.py
index 084356ec8cb14a0604bf994faca4ce15834e4b15..ec56b7ab83c6b271c92de6e5c36153927f629887 100644
--- a/models/cv/object_detection/yolov4/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov4/ixrt/deploy.py
@@ -1,90 +1,8 @@
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
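`cut_model.py` above is a thin wrapper around `onnx.utils.extract_model`, and YOLOV4_CONFIG supplies its name lists. A hypothetical stand-alone call with those values substituted in (the file paths are placeholders):

```python
import onnx

# Keep only the subgraph between the network input and the three decoder-input
# convolutions; everything downstream (the decoder) is cut away.
onnx.utils.extract_model(
    "yolov4_sim.onnx",              # input_model (placeholder)
    "yolov4_without_decoder.onnx",  # output_model (placeholder)
    ["input"],                      # MODEL_INPUT_NAMES
    [
        "/models.138/conv94/Conv_output_0",   # DECODER_INPUT_NAMES
        "/models.149/conv102/Conv_output_0",
        "/models.160/conv110/Conv_output_0",
    ],
)
```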
 import argparse
-import copy
-
-from typing import Union, Callable, List
-
-from tensorrt.deploy.api import *
-from tensorrt.deploy.backend.onnx.converter import default_converter
-from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
-from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
-from tensorrt.deploy.ir.operator_type import OperatorType as OP
-from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
-from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
-from tensorrt.deploy.ir import Graph
-from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
-from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
 from tensorrt.deploy.api import GraphTransform, create_source, create_target
 
-class FuseMishPass(BasePass):
-    def process(self, graph: Graph) -> Graph:
-        pattern = build_sequence_graph([OP.SOFTPLUS, OP.TANH, OP.MUL])
-
-        matcher = GraphMatcher(pattern, strict=False)
-        self.transform = GraphTransform(graph)
-        matcher.findall(graph, self.fuse_mish)
-        return graph
-
-    def fuse_mish(self, graph: Graph, pattern_graph: PatternGraph):
-        softplus = pattern_graph.nodes[0].operator
-        mul = pattern_graph.nodes[-1].operator
-
-        if not self.can_fused(graph, pattern_graph):
-            return
-
-        self.transform.delete_operators_between_op_op(softplus, mul)
-
-        mish_op = Operator(
-            name=generate_operator_name(graph, pattern="Mish_{idx}"),
-            op_type=OP.MISH,
-            inputs=copy.copy(softplus.inputs),
-            outputs=copy.copy(mul.outputs),
-        )
-        mish_op.is_quant_operator = softplus.is_quant_operator and mul.is_quant_operator
-        graph.add_operator(mish_op)
-
-    def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
-        softplus = pattern_graph.nodes[0].operator
-        mul = pattern_graph.nodes[-1].operator
-
-        # Check that the outputs of Softplus and Tanh are each consumed by exactly one op;
-        # if any of them has multiple consumers, the pattern cannot be fused
-        for node in pattern_graph.nodes[:2]:
-            next_ops = graph.get_next_operators(node.operator)
-            if len(next_ops) != 1:
-                return False
-
-        # Check that Mul's other input shares its source with Softplus
-        softplus_prev_op = graph.get_previous_operators(softplus)
-        if len(softplus_prev_op) != 1:
-            return False
-
-        mul_prev_op = graph.get_previous_operators(mul)
-        if len(mul_prev_op) != 2:
-            return False
-
-        for op in mul_prev_op:
-            if op is softplus_prev_op[0]:
-                return True
-
-        return False
-
-
 class Transform:
     def __init__(self, graph):
         self.t = GraphTransform(graph)
@@ -168,24 +86,32 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    else:
+    elif args.with_nms:
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
            axis=1
        )
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
+        graph.outputs.clear()
+        graph.add_output("output")
+        graph.outputs["output"].dtype = "FLOAT"
+    else:
+        graph.outputs.clear()
+        graph.add_output("decoder_8")
+        graph.outputs["decoder_8"].dtype = "FLOAT"
+        graph.add_output("decoder_16")
+        graph.outputs["decoder_16"].dtype = "FLOAT"
+        graph.add_output("decoder_32")
+        graph.outputs["decoder_32"].dtype = "FLOAT"
     return graph
 
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+    parser.add_argument("--with_nms", type=lambda x: str(x).lower() == "true", default=False, help="engine with nms (accepts True/False)")
parser.add_argument("--decoder_input_names", nargs='+', type=str) parser.add_argument("--decoder8_anchor", nargs='*', type=int) parser.add_argument("--decoder16_anchor", nargs='*', type=int) @@ -199,12 +125,10 @@ def parse_args(): args = parser.parse_args() return args - if __name__ == "__main__": args = parse_args() graph = create_source(args.src)() graph = customize_ops(graph, args) - graph = FuseMishPass().process(graph) create_target(saved_path=args.dst).export(graph) - print("Surged onnx lies on", args.dst) + print("Surged onnx lies on", args.dst) \ No newline at end of file diff --git a/models/cv/object_detection/yolov4/ixrt/export.py b/models/cv/object_detection/yolov4/ixrt/export.py index 7c8bbfa5aa79f1a982c340690658325d23fa4b54..db7e06cc9bfc5dd2b301c75f2472271222efa446 100644 --- a/models/cv/object_detection/yolov4/ixrt/export.py +++ b/models/cv/object_detection/yolov4/ixrt/export.py @@ -32,11 +32,6 @@ def parse_args(): required=True, help="darknet weights path.") - parser.add_argument("--batchsize", - type=int, - required=True, - help="Onnx model batchsize.") - parser.add_argument("--output", type=str, required=True, @@ -49,7 +44,7 @@ def parse_args(): def main(): args = parse_args() - transform_to_onnx(args.cfg, args.weight, args.batchsize, args.output) + transform_to_onnx(args.cfg, args.weight, -1, args.output) if __name__ == "__main__": main() diff --git a/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py index 2bb0abc21bd5806c51d6b908e3e3407cfdb62cc8..932efbdfd1a4e91d8ddfd363adf6bce989df1709 100644 --- a/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py +++ b/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py @@ -1,17 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. import ctypes import tensorrt from os.path import join, dirname, exists @@ -23,4 +9,4 @@ def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") ctypes.CDLL(dynamic_path) tensorrt.init_libnvinfer_plugins(logger, namespace) - print(f"Loaded plugin from {dynamic_path}") + print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov4/ixrt/modify_batchsize.py similarity index 62% rename from models/cv/object_detection/yolov5/ixrt/modify_batchsize.py rename to models/cv/object_detection/yolov4/ixrt/modify_batchsize.py index 3a88c1603bd6f457fd4965257627dc29edcda4d1..f696ae5517dfb15c020c533332c02a2b6b06c873 100644 --- a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py +++ b/models/cv/object_detection/yolov4/ixrt/modify_batchsize.py @@ -1,20 +1,7 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import onnx import argparse +import copy +import numpy as np def change_input_dim(model, bsz): batch_size = bsz @@ -46,7 +33,22 @@ def parse_args(): args = parser.parse_args() return args +def modify_resize_nodes(model, bsz): + print("modify resize") + for node in model.graph.node: + if node.op_type == "Resize": + if len(node.input) >= 4 and node.input[3]: + sizes_name = node.input[3] + for initializer in model.graph.initializer: + if initializer.name == sizes_name: + shape = copy.deepcopy(onnx.numpy_helper.to_array(initializer)) + shape[0] = shape[0] * bsz + new_sizes = np.array(shape, dtype=np.int64) + initializer.CopyFrom(onnx.numpy_helper.from_array(new_sizes, name=initializer.name)) + break + args = parse_args() model = onnx.load(args.origin_model) change_input_dim(model, args.batch_size) -onnx.save(model, args.output_model) \ No newline at end of file +modify_resize_nodes(model, args.batch_size) +onnx.save(model, args.output_model) diff --git a/models/cv/object_detection/yolov4/ixrt/quant.py b/models/cv/object_detection/yolov4/ixrt/quant.py index 70265cbc25d24d4ed41640c76f78a1839555f749..d73212ca60a4985cc036f67e8fb0b3c70ba24e4d 100644 --- a/models/cv/object_detection/yolov4/ixrt/quant.py +++ b/models/cv/object_detection/yolov4/ixrt/quant.py @@ -1,50 +1,34 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
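Two things are worth noting about the relocated `modify_batchsize.py`. First, the new `modify_resize_nodes` exists because `change_input_dim` only rewrites graph inputs: a Resize node carrying an explicit `sizes` initializer still encodes the old batch, so its first entry must be scaled too. Second, the surviving `change_input_dim` tests `isinstance(batch_size, str)` before the numeric-string branch, so the `elif` that calls `batch_size.isdigit()` is unreachable. A minimal sketch of the combined idea for the common integer case, assuming only the onnx and numpy packages (file names are placeholders):

```python
import numpy as np
import onnx
from onnx import numpy_helper

def set_batch(model: onnx.ModelProto, bsz: int) -> None:
    # Rewrite the first dimension of every graph input to a fixed batch size.
    for graph_input in model.graph.input:
        graph_input.type.tensor_type.shape.dim[0].dim_value = int(bsz)
    # Scale the batch entry of any explicit Resize "sizes" initializer to match;
    # this assumes the model was exported with batch size 1.
    inits = {init.name: init for init in model.graph.initializer}
    for node in model.graph.node:
        if node.op_type == "Resize" and len(node.input) >= 4 and node.input[3] in inits:
            init = inits[node.input[3]]
            sizes = numpy_helper.to_array(init).copy()
            sizes[0] *= bsz
            init.CopyFrom(numpy_helper.from_array(sizes.astype(np.int64), name=init.name))

model = onnx.load("yolov4_fusion.onnx")  # placeholder input
set_batch(model, 32)
onnx.save(model, "yolov4_bs32.onnx")     # placeholder output
```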
 import os
-import cv2
 import random
 import argparse
 import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-from common import letterbox
-
+from calibration_dataset import create_dataloaders
 def setseed(seed=42):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
 
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name", type=str)
-    parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+    parser.add_argument("--model", type=str, default="yolov5s_with_decoder.onnx")
+    parser.add_argument("--data_process_type", type=str, default="none")
     parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
     parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
     parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
     parser.add_argument("--disable_quant_names", nargs='*', type=str)
-    parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
-    parser.add_argument("--bsz", type=int, default=16)
-    parser.add_argument("--step", type=int, default=32)
+    parser.add_argument("--save_dir", type=str, help="save path", default=None)
+    parser.add_argument("--bsz", type=int, default=32)
+    parser.add_argument("--step", type=int, default=20)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--imgsz", type=int, default=608)
-    parser.add_argument("--use_letterbox", action="store_true")
+    parser.add_argument("--imgsz", type=int, default=640)
     args = parser.parse_args()
     return args
@@ -52,54 +36,20 @@ args = parse_args()
 setseed(args.seed)
 model_name = args.model_name
 
-
-def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
-    num = step * batch_size
-    val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
-    random.shuffle(val_list)
-    pic_list = val_list[:num]
-
-    calibration_dataset = []
-    for file_path in pic_list:
-        pic_data = cv2.imread(file_path)
-        org_img = pic_data
-        assert org_img is not None, 'Image not Found ' + file_path
-        h0, w0 = org_img.shape[:2]
-
-        if use_letterbox:
-            img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
-        else:
-            img = cv2.resize(org_img, new_shape)
-        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-        img = np.ascontiguousarray(img) / 255.0  # 0~1 np array
-        img = torch.from_numpy(img).float()
-
-        calibration_dataset.append(img)
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=True,
-        batch_size=batch_size,
-        drop_last=True
-    )
-    return calibration_dataloader
-
-dataloader = get_dataloader(
-    data_dir=args.dataset_dir,
-    step=args.step,
+out_dir = args.save_dir
+dataloader = create_dataloaders(
+    data_path=args.dataset_dir,
+    annFile=args.ann_file,
+    img_sz=args.imgsz,
     batch_size=args.bsz,
-    new_shape=(args.imgsz, args.imgsz),
-    use_letterbox=args.use_letterbox
+    step=args.step,
+    data_process_type=args.data_process_type
 )
-
-dirname = os.path.dirname(args.save_quant_model)
-quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
-
+# print("disable_quant_names : ", args.disable_quant_names)
 static_quantize(args.model,
                 calibration_dataloader=dataloader,
-                save_quant_onnx_path=args.save_quant_model,
-                save_quant_params_path=quant_json_path,
+                save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
                 observer=args.observer,
-                data_preprocess=lambda x: x.to("cuda"),
+                data_preprocess=lambda x: x[0].to("cuda"),
                 quant_format="qdq",
-                disable_quant_names=args.disable_quant_names)
+                disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
index c33dc591362e34df05378869f4254190ef5a6985..c86762e0f22e64a37813310851799c39d9dfed21 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
@@ -1,92 +1,185 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-        EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run parameters
+BSZ=32
+WARM_UP=-1
+TGT=0.65
+LOOP_COUNT=-1
+RUN_MODE=MAP
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
 else
-    python3 ${RUN_DIR}/cut_model.py \
-    --input_model ${CURRENT_MODEL} \
-    --output_model ${FINAL_MODEL} \
-    --input_names input \
-    --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
+fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder Skip, ${NO_DECODER_MODEL} already exists
+else
+    python3 ${RUN_DIR}/cut_model.py \
+        --input_model ${CURRENT_MODEL} \
+        --output_model ${NO_DECODER_MODEL} \
+        --input_names ${MODEL_INPUT_NAMES[@]} \
+        --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV3Decoder \
+            --with_nms True \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo " "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
 else
-    python3 ${RUN_DIR}/deploy.py \
-    --src ${CURRENT_MODEL} \
-    --dst ${FINAL_MODEL} \
-    --decoder_type YoloV3Decoder \
-    --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-    --decoder8_anchor 12 16 19 36 40 28 \
-    --decoder16_anchor 36 75 76 55 72 146 \
-    --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
     echo " "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine if [ -f $ENGINE_FILE ];then echo " "Build Engine Skip, $ENGINE_FILE has been existed else python3 ${RUN_DIR}/build_engine.py \ - --precision float16 \ + --precision ${PRECISION} \ --model ${CURRENT_MODEL} \ --engine ${ENGINE_FILE} echo " "Generate Engine ${ENGINE_FILE} fi +if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then + NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine + # Build NMS Engine + python3 ${RUN_DIR}/build_nms_engine.py \ + --bsz ${BSZ} \ + --path ${CHECKPOINTS_DIR} \ + --all_box_num ${ALL_BOX_NUM} \ + --max_box_pre_img ${MAX_BOX_PRE_IMG} \ + --iou_thresh ${IOU_THRESH} \ + --score_thresh ${SCORE_THRESH} +fi # Inference -echo Inference +let step++ +echo; +echo [STEP ${step}] : Inference RUN_BATCH_SIZE=16 python3 ${RUN_DIR}/inference.py \ --test_mode MAP \ @@ -100,4 +193,4 @@ python3 ${RUN_DIR}/inference.py \ --pred_dir ${CHECKPOINTS_DIR} \ --precision float16 \ --map_target 0.30; check_status -exit ${EXIT_STATUS} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh index a4a83ce72e715997ca64cf35b0b0ff0e8bd351a5..dabe655b7b15af873888208ebd1a3d1a2b36c441 100644 --- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh +++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh @@ -1,92 +1,186 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
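All four rewritten scripts share the `check_status` helper shown above: it snapshots `PIPESTATUS[0]` of the command that ran immediately before it, and treats exit code 10 as non-fatal only when `TEST_PERF=0` (reading the condition, this presumably marks a missed performance target rather than a hard failure; that interpretation is an assumption). Usage only works when the function call directly follows the pipeline it checks:

```bash
# check_status must be the very next command after the pipeline it guards,
# otherwise PIPESTATUS has already been overwritten by something else.
python3 ${RUN_DIR}/inference.py --test_mode FPS --fps_target ${TGT}; check_status
exit ${EXIT_STATUS}
```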
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-        EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run parameters
+BSZ=32
+WARM_UP=3
+TGT=1010
+LOOP_COUNT=100
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
 else
-    python3 ${RUN_DIR}/cut_model.py \
-    --input_model ${CURRENT_MODEL} \
-    --output_model ${FINAL_MODEL} \
-    --input_names input \
-    --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder Skip, ${NO_DECODER_MODEL} already exists
+else
+    python3 ${RUN_DIR}/cut_model.py \
+        --input_model ${CURRENT_MODEL} \
+        --output_model ${NO_DECODER_MODEL} \
+        --input_names ${MODEL_INPUT_NAMES[@]} \
+        --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV3Decoder \
+            --with_nms False \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo " "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
 else
-    python3 ${RUN_DIR}/deploy.py \
-    --src ${CURRENT_MODEL} \
-    --dst ${FINAL_MODEL} \
-    --decoder_type YoloV3Decoder \
-    --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-    --decoder8_anchor 12 16 19 36 40 28 \
-    --decoder16_anchor 36 75 76 55 72 146 \
-    --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
     echo " "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo " "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py \
-    --precision float16 \
+        --precision ${PRECISION} \
         --model ${CURRENT_MODEL} \
         --engine ${ENGINE_FILE}
     echo " "Generate Engine ${ENGINE_FILE}
 fi
 
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py \
+        --bsz ${BSZ} \
+        --path ${CHECKPOINTS_DIR} \
+        --all_box_num ${ALL_BOX_NUM} \
+        --max_box_pre_img ${MAX_BOX_PRE_IMG} \
+        --iou_thresh ${IOU_THRESH} \
+        --score_thresh ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py \
     --test_mode FPS \
@@ -100,4 +194,4 @@ python3 ${RUN_DIR}/inference.py \
     --pred_dir ${CHECKPOINTS_DIR} \
     --precision float16 \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
index 20e593785bc25bb06b8c0f2e537542981cc65c00..646b115f5232dd0fd3fa873398537b3c5f5c823a 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
@@ -1,110 +1,185 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-        EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run parameters
+BSZ=32
+WARM_UP=-1
+TGT=0.65
+LOOP_COUNT=-1
+RUN_MODE=MAP
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
 else
-    python3 ${RUN_DIR}/cut_model.py \
-    --input_model ${CURRENT_MODEL} \
-    --output_model ${FINAL_MODEL} \
-    --input_names input \
-    --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}
 
-# quant
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "Change Batchsize Skip, $FINAL_MODEL has been existed
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder Skip, ${NO_DECODER_MODEL} already exists
 else
-    python3 ${RUN_DIR}/quant.py \
-        --model_name "YOLOV4_DARKNET" \
-        --model ${CURRENT_MODEL} \
-        --bsz ${BATCH_SIZE} \
-        --dataset_dir ${EVAL_DIR} \
-        --ann_file ${COCO_GT} \
-        --observer "hist_percentile" \
-        --save_quant_model ${FINAL_MODEL} \
-        --imgsz 608
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/cut_model.py \
+        --input_model ${CURRENT_MODEL} \
+        --output_model ${NO_DECODER_MODEL} \
+        --input_names ${MODEL_INPUT_NAMES[@]} \
+        --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV3Decoder \
+            --with_nms True \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo " "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
 else
-    python3 ${RUN_DIR}/deploy.py \
-    --src ${CURRENT_MODEL} \
-    --dst ${FINAL_MODEL} \
-    --decoder_type YoloV3Decoder \
-    --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-    --decoder8_anchor 12 16 19 36 40 28 \
-    --decoder16_anchor 36 75 76 55 72 146 \
-    --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
     echo " "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo " "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py \
-    --precision int8 \
+        --precision ${PRECISION} \
         --model ${CURRENT_MODEL} \
         --engine ${ENGINE_FILE}
     echo " "Generate Engine ${ENGINE_FILE}
 fi
 
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py \
+        --bsz ${BSZ} \
+        --path ${CHECKPOINTS_DIR} \
+        --all_box_num ${ALL_BOX_NUM} \
+        --max_box_pre_img ${MAX_BOX_PRE_IMG} \
+        --iou_thresh ${IOU_THRESH} \
+        --score_thresh ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py \
     --test_mode MAP \
@@ -116,6 +191,6 @@ python3 ${RUN_DIR}/inference.py \
     --eval_dir ${EVAL_DIR} \
     --coco_gt ${COCO_GT} \
     --pred_dir ${CHECKPOINTS_DIR} \
-    --precision int8 \
+    --precision ${PRECISION} \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
index 7f11038651ffe1e27c8e57f40e7b6f74b67e1945..4665a65f6f8b3052b2beabb6221364a3b927b7bc 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
@@ -1,110 +1,186 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
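For MAP runs with `NMS_TYPE=GPU`, every script also builds a small standalone NMS engine before inference. An illustrative stand-alone equivalent of that step, with YOLOV4_CONFIG's values substituted in (the path is a placeholder):

```bash
python3 build_nms_engine.py \
    --bsz 32 \
    --path ./checkpoints/tmp \
    --all_box_num 10647 \
    --max_box_pre_img 1000 \
    --iou_thresh 0.6 \
    --score_thresh 0.001
```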
EXIT_STATUS=0
check_status()
{
-    if ((${PIPESTATUS[0]} != 0));then
-        EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+        [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
}

-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run parameters
+BSZ=32
+WARM_UP=3
+TGT=1010
+LOOP_COUNT=100
+RUN_MODE=FPS
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}

 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}

-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}

-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
 else
-    python3 ${RUN_DIR}/cut_model.py \
-        --input_model ${CURRENT_MODEL} \
-        --output_model ${FINAL_MODEL} \
-        --input_names input \
-        --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${SIM_MODEL}
+    echo " "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}

-# quant
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo " "Change Batchsize Skip, $FINAL_MODEL has been existed
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo " "Cut Decoder Skip, ${NO_DECODER_MODEL} already exists
 else
-    python3 ${RUN_DIR}/quant.py \
-        --model_name "YOLOV4_DARKNET" \
-        --model ${CURRENT_MODEL} \
-        --bsz ${BATCH_SIZE} \
-        --dataset_dir ${EVAL_DIR} \
-        --ann_file ${COCO_GT} \
-        --observer "hist_percentile" \
-        --save_quant_model ${FINAL_MODEL} \
-        --imgsz 608
-    echo " "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/cut_model.py \
+        --input_model ${CURRENT_MODEL} \
+        --output_model ${NO_DECODER_MODEL} \
+        --input_names ${MODEL_INPUT_NAMES[@]} \
+        --output_names ${DECODER_INPUT_NAMES[@]}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${NO_DECODER_MODEL}

-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+    else
+        python3 ${RUN_DIR}/quant.py \
+            --model ${CURRENT_MODEL} \
+            --model_name ${MODEL_NAME} \
+            --dataset_dir ${EVAL_DIR} \
+            --ann_file ${COCO_GT} \
+            --data_process_type ${DATA_PROCESS_TYPE} \
+            --observer ${QUANT_OBSERVER} \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+            --save_dir $CHECKPOINTS_DIR \
+            --bsz ${QUANT_BATCHSIZE} \
+            --step ${QUANT_STEP} \
+            --seed ${QUANT_SEED} \
+            --imgsz ${IMGSIZE}
+        echo " "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_concat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo " "Add Decoder Skip, $FUSION_ONNX already exists
+    else
+        python3 ${RUN_DIR}/deploy.py \
+            --src ${CURRENT_MODEL} \
+            --dst ${FUSION_ONNX} \
+            --decoder_type YoloV3Decoder \
+            --with_nms False \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor ${DECODER_8_ANCHOR[@]} \
+            --decoder16_anchor ${DECODER_16_ANCHOR[@]} \
+            --decoder32_anchor ${DECODER_32_ANCHOR[@]} \
+            --num_class ${DECODER_NUM_CLASS} \
+            --faster ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo " "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo " "Change Batchsize Skip, $FINAL_MODEL already exists
 else
-    python3 ${RUN_DIR}/deploy.py \
-        --src ${CURRENT_MODEL} \
-        --dst ${FINAL_MODEL} \
-        --decoder_type YoloV3Decoder \
-        --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-        --decoder8_anchor 12 16 19 36 40 28 \
-        --decoder16_anchor 36 75 76 55 72 146 \
-        --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py \
+        --batch_size ${BSZ} \
+        --origin_model ${CURRENT_MODEL} \
+        --output_model ${FINAL_MODEL}
     echo " "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}

 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo " "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py \
-        --precision int8 \
+        --precision ${PRECISION} \
        --model ${CURRENT_MODEL} \
        --engine ${ENGINE_FILE}
     echo " "Generate Engine ${ENGINE_FILE}
 fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py \
+        --bsz ${BSZ} \
+        --path ${CHECKPOINTS_DIR} \
+        --all_box_num ${ALL_BOX_NUM} \
+        --max_box_pre_img ${MAX_BOX_PRE_IMG} \
+        --iou_thresh ${IOU_THRESH} \
+        --score_thresh ${SCORE_THRESH}
+fi

 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py \
     --test_mode FPS \
@@ -116,6 +192,6 @@ python3 ${RUN_DIR}/inference.py \
     --eval_dir ${EVAL_DIR} \
     --coco_gt ${COCO_GT} \
     --pred_dir ${CHECKPOINTS_DIR} \
-    --precision int8 \
+    --precision float16 \
     --map_target 0.30;
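+# check_status (defined at the top of this script) inspects ${PIPESTATUS[0]} of the
+# inference command above; exit code 10 is tolerated when TEST_PERF=0 (presumably a
+# missed perf target that should not fail an accuracy-gated run), while any other
+# non-zero code sets EXIT_STATUS=1 for the final exit.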
check_status -exit ${EXIT_STATUS} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/simplify_model.py b/models/cv/object_detection/yolov4/ixrt/simplify_model.py similarity index 46% rename from models/cv/object_detection/yolov5/ixrt/simplify_model.py rename to models/cv/object_detection/yolov4/ixrt/simplify_model.py index 1400fd81ddb4b3fae1b20d0fd35082a692f5d292..b4254b6f903cb5f8775e43b2f80d5572bf45b1d6 100644 --- a/models/cv/object_detection/yolov5/ixrt/simplify_model.py +++ b/models/cv/object_detection/yolov4/ixrt/simplify_model.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import onnx import argparse from onnxsim import simplify diff --git a/models/cv/object_detection/yolov5/ixrt/README.md b/models/cv/object_detection/yolov5/ixrt/README.md index 6870c7d070549070236f8ad1a40f81489ce9b60e..69f568404f9abe717223373f3a76746cbc48d2a7 100644 --- a/models/cv/object_detection/yolov5/ixrt/README.md +++ b/models/cv/object_detection/yolov5/ixrt/README.md @@ -30,7 +30,7 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-glx -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt ``` ### Model Conversion @@ -54,13 +54,13 @@ mv yolov5m.onnx /Path/to/checkpoints ## Model Inference ```bash -export PROJ_DIR=/Path/to/yolov5/ixrt -export DATASETS_DIR=/Path/to/coco2017/ +export PROJ_DIR=./ +export DATASETS_DIR=/Path/to/coco/ export CHECKPOINTS_DIR=./checkpoints export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json -export EVAL_DIR=${DATASETS_DIR}/val2017 -export RUN_DIR=/Path/to/yolov5/ixrt -export CONFIG_DIR=config/YOLOV5_CONFIG +export EVAL_DIR=${DATASETS_DIR}/images/val2017 +export RUN_DIR=../../ixrt_common +export CONFIG_DIR=../../ixrt_common/config/YOLOV5M_CONFIG ``` ### FP16 diff --git a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py deleted file mode 100644 index 3be0d83d0d966018f59b87d22f628b9b1ddf9b21..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import argparse -import torch -import onnx -from onnx import helper -from onnx import TensorProto, numpy_helper -import tensorrt - -def create_onnx(args): - nms = helper.make_node( - "NMS", - name="NMS", - inputs=["nms_input"], - outputs=["nms_output0", "nms_output1"], - nMaxKeep=args.max_box_pre_img, - fIoUThresh=args.iou_thresh, - fScoreThresh=args.score_thresh - ) - graph = helper.make_graph( - nodes=[nms], - name="gpu_nms", - inputs=[ - helper.make_tensor_value_info( - "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6) - ) - ], - outputs=[ - helper.make_tensor_value_info( - "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6) - ), - helper.make_tensor_value_info( - "nms_output1", onnx.TensorProto.INT32, (args.bsz,) - ) - ], - initializer=[] - ) - - op = onnx.OperatorSetIdProto() - op.version = 13 - model = onnx.helper.make_model(graph) - - model = onnx.helper.make_model(graph, opset_imports=[op]) - onnx_path = args.path + "/nms.onnx" - onnx.save(model, onnx_path) - -def build_engine(args): - onnx_path = args.path + "/nms.onnx" - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(onnx_path) - plan = builder.build_serialized_network(network, build_config) - - engine_path = args.path + "/nms.engine" - with open(engine_path, "wb") as f: - f.write(plan) - -def main(args): - create_onnx(args) - build_engine(args) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--bsz", type=int, default=1, help="batch size") - parser.add_argument("--path", type=str) - parser.add_argument("--all_box_num", type=int, default=25200) - parser.add_argument("--max_box_pre_img", type=int, default=1000) - parser.add_argument("--iou_thresh", type=float, default=0.6) - parser.add_argument("--score_thresh", type=float, default=0.001) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh index b66c06b56fd42240b3578ef2a764a39c5fe06b03..b99ab99d2fccb0f83ceadda8983ac202ff3f4268 100644 --- a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh @@ -25,7 +25,7 @@ else echo "Not Support Os" fi -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt mkdir checkpoints cp -r /root/data/3rd_party/yolov5 ./ diff --git a/models/cv/object_detection/yolov5/ixrt/coco_labels.py b/models/cv/object_detection/yolov5/ixrt/coco_labels.py deleted file mode 100644 index 43f5bd82cd257efdcab2bdba6bad64d9bb90416e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/coco_labels.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -labels = [ - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "backpack", - "umbrella", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "dining table", - "toilet", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", -] -def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) - return [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] - -__all__ = ["labels"] diff --git a/models/cv/object_detection/yolov5/ixrt/common.py b/models/cv/object_detection/yolov5/ixrt/common.py deleted file mode 100644 index aba2117c9942d6823abf73bf3ab94c291a7705e2..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/common.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import numpy as np -from tqdm import tqdm - -import tensorrt -import pycuda.driver as cuda - -# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] -# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] -def box_class85to6(input): - center_x_y = input[:, :2] - side = input[:, 2:4] - conf = input[:, 4:5] - class_id = np.argmax(input[:, 5:], axis = -1) - class_id = class_id.astype(np.float32).reshape(-1, 1) + 1 - max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1) - x1_y1 = center_x_y - 0.5 * side - x2_y2 = center_x_y + 0.5 * side - nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1) - return nms_input - -def save2json(batch_img_id, pred_boxes, json_result, class_trans): - for i, boxes in enumerate(pred_boxes): - if boxes is not None: - image_id = int(batch_img_id[i]) - # have no target - if image_id == -1: - continue - for x, y, w, h, c, p in boxes: - x, y, w, h, p = float(x), float(y), float(w), float(h), float(p) - c = int(c) - json_result.append( - { - "image_id": image_id, - "category_id": class_trans[c - 1], - "bbox": [x, y, w, h], - "score": p, - } - ) - -def create_engine_context(engine_path, logger): - with open(engine_path, "rb") as f: - runtime = tensorrt.Runtime(logger) - assert runtime - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine - context = engine.create_execution_context() - assert context - - return engine, context - -def get_io_bindings(engine): - # Setup I/O bindings - inputs = [] - outputs = [] - allocations = [] - - for i in range(engine.num_bindings): - is_input = False - if engine.binding_is_input(i): - is_input = True - name = engine.get_binding_name(i) - dtype = engine.get_binding_dtype(i) - shape = engine.get_binding_shape(i) - if is_input: - batch_size = shape[0] - size = np.dtype(tensorrt.nptype(dtype)).itemsize - for s in shape: - size *= s - allocation = cuda.mem_alloc(size) - binding = { - "index": i, - "name": name, - "dtype": np.dtype(tensorrt.nptype(dtype)), - "shape": list(shape), - "allocation": allocation, - } - print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") - allocations.append(allocation) - if engine.binding_is_input(i): - inputs.append(binding) - else: - outputs.append(binding) - return inputs, outputs, allocations \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/cut_model.py b/models/cv/object_detection/yolov5/ixrt/cut_model.py deleted file mode 100644 index e9ee19aadf0809fe1b97e3225d09150fb54513f7..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/cut_model.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import onnx -import argparse -from onnxsim import simplify - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_model", type=str) - parser.add_argument("--output_model", type=str) - parser.add_argument("--input_names", nargs='+', type=str) - parser.add_argument("--output_names", nargs='+', type=str) - args = parser.parse_args() - return args - -args = parse_args() -onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names) -print(" Cut Model Done.") \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py deleted file mode 100644 index 162e24b462289dcee7b7a2888b93fad1115def81..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py b/models/cv/object_detection/yolov5/ixrt/datasets/coco.py deleted file mode 100644 index 73c5df54761b917ecd0127fb56b61d9bd34c1196..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os.path -from typing import Any, Callable, List, Optional, Tuple - -import cv2 - -from .vision import VisionDataset -from .pre_process import get_post_process -class CocoDetection(VisionDataset): - """`MS Coco Detection `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. 
- """ - - def __init__( - self, - root: str, - annFile: str, - img_size: int, - data_process_type: str, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - transforms: Optional[Callable] = None, - - ) -> None: - super().__init__(root, transforms, transform, target_transform) - from pycocotools.coco import COCO - - self.coco = COCO(annFile) - self.ids = list(sorted(self.coco.imgs.keys())) - self.img_size = img_size - - self.transforms = get_post_process(data_process_type) - - def _load_image(self, id: int): - path = self.coco.loadImgs(id)[0]["file_name"] - data = cv2.imread(os.path.join(self.root, path)) - return data - - def _load_target(self, id: int) -> List[Any]: - return self.coco.loadAnns(self.coco.getAnnIds(id)) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - id = self.ids[index] - image = self._load_image(id) - target = self._load_target(id) - origin_shape = image.shape[:2] - - if self.transforms is not None: - image = self.transforms(image, self.img_size) - - if len(target) > 0: - image_id = target[0]["image_id"] - else: - # have no target - image_id = -1 - return image, origin_shape, image_id - - def __len__(self) -> int: - return len(self.ids) - - -class CocoCaptions(CocoDetection): - """`MS Coco Captions `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. - - Example: - - .. code:: python - - import torchvision.datasets as dset - import torchvision.transforms as transforms - cap = dset.CocoCaptions(root = 'dir where images are', - annFile = 'json annotation file', - transform=transforms.PILToTensor()) - - print('Number of samples: ', len(cap)) - img, target = cap[3] # load 4th sample - - print("Image Size: ", img.size()) - print(target) - - Output: :: - - Number of samples: 82783 - Image Size: (3L, 427L, 640L) - [u'A plane emitting smoke stream flying over a mountain.', - u'A plane darts across a bright blue sky behind a mountain covered in snow', - u'A plane leaves a contrail above the snowy mountain top.', - u'A mountain that has a plane flying overheard in the distance.', - u'A mountain view with a plume of smoke in the background'] - - """ - - def _load_target(self, id: int) -> List[str]: - return [ann["caption"] for ann in super()._load_target(id)] diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py deleted file mode 100644 index 8590816a0df18b6ef296ebe305b15b81240ab1d0..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox, scale_boxes, clip_boxes - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Postprocess - elif data_process_type == "yolov3": - return Yolov3Postprocess - elif data_process_type == "yolox": - return YoloxPostprocess - return None - -def Yolov3Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=False - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def Yolov5Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=True - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def YoloxPostprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i]) - boxes[:, :4] /= r - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i])) - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py deleted file mode 100644 index c651f8adb7c8190c214fbbbb7769c7d0713e9619..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Preprocess - elif data_process_type == "yolov3": - return Yolov3Preprocess - elif data_process_type == "yolox": - return YoloxPreprocess - return None - -def Yolov3Preprocess(image, img_size): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - image = cv2.resize(image, (img_size, img_size)) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def Yolov5Preprocess(image, img_size, augment=False): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - if r != 1: # if sizes are not equal - interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA - image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp) - - # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size rect == True - - image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def YoloxPreprocess(img, img_size, swap=(2,0,1)): - - padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114 - r = min(img_size / img.shape[0], img_size / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - - return padded_img \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py b/models/cv/object_detection/yolov5/ixrt/datasets/vision.py deleted file mode 100644 index eadefb2c5b35abd0a11fa85c65891461a210aef8..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -from typing import Any, Callable, List, Optional, Tuple - -import torch -import torch.utils.data as data - -from types import FunctionType - -def _log_api_usage_once(obj: Any) -> None: - - """ - Logs API usage(module and name) within an organization. - In a large ecosystem, it's often useful to track the PyTorch and - TorchVision APIs usage. This API provides the similar functionality to the - logging module in the Python stdlib. It can be used for debugging purpose - to log which methods are used and by default it is inactive, unless the user - manually subscribes a logger via the `SetAPIUsageLogger method `_. - Please note it is triggered only once for the same API call within a process. - It does not collect any data from open-source users since it is no-op by default. - For more information, please refer to - * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; - * Logging policy: https://github.com/pytorch/vision/issues/5052; - - Args: - obj (class instance or method): an object to extract info from. - """ - module = obj.__module__ - if not module.startswith("torchvision"): - module = f"torchvision.internal.{module}" - name = obj.__class__.__name__ - if isinstance(obj, FunctionType): - name = obj.__name__ - torch._C._log_api_usage_once(f"{module}.{name}") - -class VisionDataset(data.Dataset): - """ - Base Class For making datasets which are compatible with torchvision. - It is necessary to override the ``__getitem__`` and ``__len__`` method. - - Args: - root (string): Root directory of dataset. - transforms (callable, optional): A function/transforms that takes in - an image and a label and returns the transformed versions of both. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.RandomCrop`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - - .. note:: - - :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive. - """ - - _repr_indent = 4 - - def __init__( - self, - root: str, - transforms: Optional[Callable] = None, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - ) -> None: - _log_api_usage_once(self) - if isinstance(root, str): - root = os.path.expanduser(root) - self.root = root - - has_transforms = transforms is not None - has_separate_transform = transform is not None or target_transform is not None - if has_transforms and has_separate_transform: - raise ValueError("Only transforms or transform/target_transform can be passed as argument") - - # for backwards-compatibility - self.transform = transform - self.target_transform = target_transform - - if has_separate_transform: - transforms = StandardTransform(transform, target_transform) - self.transforms = transforms - - def __getitem__(self, index: int) -> Any: - """ - Args: - index (int): Index - - Returns: - (Any): Sample and meta data, optionally transformed by the respective transforms. 
- """ - raise NotImplementedError - - def __len__(self) -> int: - raise NotImplementedError - - def __repr__(self) -> str: - head = "Dataset " + self.__class__.__name__ - body = [f"Number of datapoints: {self.__len__()}"] - if self.root is not None: - body.append(f"Root location: {self.root}") - body += self.extra_repr().splitlines() - if hasattr(self, "transforms") and self.transforms is not None: - body += [repr(self.transforms)] - lines = [head] + [" " * self._repr_indent + line for line in body] - return "\n".join(lines) - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def extra_repr(self) -> str: - return "" - - -class StandardTransform: - def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: - self.transform = transform - self.target_transform = target_transform - - def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]: - if self.transform is not None: - input = self.transform(input) - if self.target_transform is not None: - target = self.target_transform(target) - return input, target - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def __repr__(self) -> str: - body = [self.__class__.__name__] - if self.transform is not None: - body += self._format_transform_repr(self.transform, "Transform: ") - if self.target_transform is not None: - body += self._format_transform_repr(self.target_transform, "Target transform: ") - - return "\n".join(body) diff --git a/models/cv/object_detection/yolov5/ixrt/inference.py b/models/cv/object_detection/yolov5/ixrt/inference.py deleted file mode 100644 index c0476b899ba0ec51ab4aedc0596f19cb283952ab..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/inference.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import argparse -import glob -import json -import os -import time -import sys - -import torch -import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda - -from coco_labels import coco80_to_coco91_class, labels -from common import save2json, box_class85to6 -from common import create_engine_context, get_io_bindings -from calibration_dataset import create_dataloaders -from datasets.post_process import get_post_process - -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -from tqdm import tqdm -from tqdm.contrib import tzip - -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - - # Load dataloader - dataloader = create_dataloaders( - data_path=config.eval_dir, - annFile=config.coco_gt, - img_sz=config.imgsz, - batch_size=config.bsz, - step=config.loop_count, - data_process_type=config.data_process_type - ) - - # Load post process func - if config.test_mode == "MAP": - post_process_func = get_post_process(config.data_process_type) - - bsz = config.bsz - num_samples = 5000 - if config.loop_count > 0 and config.loop_count < num_samples/bsz : - num_samples = bsz * config.loop_count - num_batch = len(dataloader) - print("=" * 30) - print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}") - print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}") - print("=" * 30) - - json_result = [] - forward_time = 0.0 - class_map = coco80_to_coco91_class() - - host_mem = tensorrt.IHostMemory - logger = tensorrt.Logger(tensorrt.Logger.ERROR) - - # Load Engine - engine, context = create_engine_context(config.model_engine, logger) - inputs, outputs, allocations = get_io_bindings(engine) - - # Load nms_engine - if config.test_mode == "MAP" and config.nms_type == "GPU": - nms_engine, nms_context = create_engine_context(config.nms_engine, logger) - nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine) - nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"]) - nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"]) - print(f"nms_output0 shape : {nms_output0.shape} nms_output0 type : {nms_output0.dtype}") - print(f"nms_output1 shape : {nms_output1.shape} nms_output1 type : {nms_output1.dtype}") - - # Warm up - if config.warm_up > 0: - print("\nWarm Start.") - for i in range(config.warm_up): - context.execute_v2(allocations) - print("Warm Done.") - - # Prepare the output data - output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - print(f"output shape : {output.shape} output type : {output.dtype}") - - for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader): - batch_data = batch_data.numpy() - batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()] - # batch_img_id = batch_img_id.numpy() - - cur_bsz_sample = batch_data.shape[0] - - # Set input - cuda.memcpy_htod(inputs[0]["allocation"], batch_data) - - # Forward - start_time = time.time() - context.execute_v2(allocations) - end_time = time.time() - forward_time += end_time - start_time - - if config.test_mode == "MAP": - # Fetch output - cuda.memcpy_dtoh(output, outputs[0]["allocation"]) - - # Step 1 : prepare data to nms - _, box_num, box_unit = output.shape - if config.debug: - print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}") - - if config.decoder_faster == 0: - nms_input = box_class85to6(output.reshape(-1, box_unit)) - else: - nms_input = output - - # Step 2 : 
nms - # cpu nms(TODO) - - # gpu nms - if config.nms_type == "GPU": - - # Set nms input - cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input) - nms_context.execute_v2(nms_allocations) - cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"]) - cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"]) - - # Step 3 : post process + save - pred_boxes = post_process_func( - ori_img_shape=batch_img_shape, - imgsz=(config.imgsz, config.imgsz), - box_datas=nms_output0, - box_nums=nms_output1, - sample_num=cur_bsz_sample, - max_det=config.max_det - ) - save2json(batch_img_id, pred_boxes, json_result, class_map) - - fps = num_samples / forward_time - - if config.test_mode == "FPS": - print("FPS : ", fps) - print(f"Performance Check : Test {fps} >= target {config.fps_target}") - if fps >= config.fps_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - - if config.test_mode == "MAP": - if len(json_result) == 0: - print("Predict zero box!") - exit(1) - - if not os.path.exists(config.pred_dir): - os.makedirs(config.pred_dir) - - pred_json = os.path.join( - config.pred_dir, f"{config.model_name}_{config.precision}_preds.json" - ) - with open(pred_json, "w") as f: - json.dump(json_result, f) - - start_time = time.time() - anno_json = config.coco_gt - anno = COCO(anno_json) # init annotations api - pred = anno.loadRes(pred_json) # init predictions api - eval = COCOeval(anno, pred, "bbox") - - eval.evaluate() - eval.accumulate() - print( - f"==============================eval {config.model_name} {config.precision} coco map ==============================" - ) - eval.summarize() - e2e_time = time.time() - start_time - map, map50 = eval.stats[:2] - print(F"E2E time : {e2e_time:.3f} seconds") - print("MAP@0.5 : ", map50) - print(f"Accuracy Check : Test {map50} >= target {config.map_target}") - print(F"E2E time : {e2e_time:.3f} seconds") - if map50 >= config.map_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - -def parse_config(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX" - ) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") - parser.add_argument( - "--model_engine", - type=str, - default="", - help="model engine path", - ) - parser.add_argument( - "--nms_engine", - type=str, - default="", - help="nms engine path", - ) - parser.add_argument( - "--coco_gt", - type=str, - default="data/datasets/cv/coco2017/annotations/instances_val2017.json", - help="coco instances_val2017.json", - ) - parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") - parser.add_argument("--loop_count", type=int, default=-1, help="loop count") - parser.add_argument( - "--eval_dir", - type=str, - default="data/datasets/cv/coco2017/val2017", - help="coco image dir", - ) - parser.add_argument("--bsz", type=int, default=32, help="test batch size") - parser.add_argument( - "--imgsz", - "--img", - "--img-size", - type=int, - default=640, - help="inference size h,w", - ) - parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--use_async", action="store_true") - parser.add_argument("--debug", action="store_true") - parser.add_argument("--pred_dir", type=str, default=".", 
help="pred save json dirs") - parser.add_argument("--map_target", type=float, default=0.56, help="target mAP") - parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps") - parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly") - parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU") - - config = parser.parse_args() - print("config:", config) - return config - -if __name__ == "__main__": - config = parse_config() - main(config) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py deleted file mode 100644 index ae47dc8e854b6bea1f768e65c4dd481048bfebce..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import ctypes -import tensorrt -from os.path import join, dirname, exists -def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") - if not exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") - ctypes.CDLL(dynamic_path) - tensorrt.init_libnvinfer_plugins(logger, namespace) - print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/quant.py b/models/cv/object_detection/yolov5/ixrt/quant.py deleted file mode 100644 index 36fd39a13c2e1e40f4dc0098f042e66e4bd0d26a..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/quant.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import random -import argparse -import numpy as np -from tensorrt.deploy import static_quantize - -import torch -from calibration_dataset import create_dataloaders - -def setseed(seed=42): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str) - parser.add_argument("--model", type=str, default="yolov5s_with_decoder.onnx") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017") - parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json") - parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile") - parser.add_argument("--disable_quant_names", nargs='*', type=str) - parser.add_argument("--save_dir", type=str, help="save path", default=None) - parser.add_argument("--bsz", type=int, default=32) - parser.add_argument("--step", type=int, default=20) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--imgsz", type=int, default=640) - args = parser.parse_args() - return args - -args = parse_args() -setseed(args.seed) -model_name = args.model_name - -out_dir = args.save_dir -dataloader = create_dataloaders( - data_path=args.dataset_dir, - annFile=args.ann_file, - img_sz=args.imgsz, - batch_size=args.bsz, - step=args.step, - data_process_type=args.data_process_type -) -# print("disable_quant_names : ", args.disable_quant_names) -static_quantize(args.model, - calibration_dataloader=dataloader, - save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"), - observer=args.observer, - data_preprocess=lambda x: x[0].to("cuda"), - quant_format="qdq", - disable_quant_names=args.disable_quant_names) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/requirements.txt b/models/cv/object_detection/yolov5/ixrt/requirements.txt deleted file mode 100644 index 10a9fba6a70545eee20ab0db7bb740b1d4807f95..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5/ixrt/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -tqdm -onnx -onnxsim -ultralytics==8.3.97 -pycocotools -opencv-python==4.6.0.66 -pycuda \ No newline at end of file diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh index 140ab8ace521610303cbfc0582e0c5eaf6188c62..52ec959f1ea4b6b192b111c5b837904183d17876 100644 --- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh +++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh index 01542134796eda4a0d46c33e3d28c23120e690ba..5e2f97fb43b1480fcce35669e7d2346f63eb64a8 100644 --- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh +++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh index 18d11eff42fc5ecf60ae3338ae1c4688ff252127..606fc94c8c7f11a517b7bf3fe1fcf22ccb6d68d1 100644 --- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh +++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh index 08525d287a6e850b7ee253bb13dc165c2253f045..b29836695881a184603d63159e0872c508d94486 100644 --- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh +++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov5s/ixrt/README.md b/models/cv/object_detection/yolov5s/ixrt/README.md index 079c9644b671b5137d64627173657430e9b22ef3..88f55f22a1f9ed3f815bd23e6f07cbb80c37e192 100755 --- a/models/cv/object_detection/yolov5s/ixrt/README.md +++ b/models/cv/object_detection/yolov5s/ixrt/README.md @@ -27,7 +27,7 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-glx -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt ``` ### Model Conversion @@ -53,13 +53,13 @@ popd ## Model Inference ```bash -export PROJ_DIR=/Path/to/yolov5s/ixrt -export DATASETS_DIR=/Path/to/coco2017/ +export PROJ_DIR=./ +export DATASETS_DIR=/Path/to/coco/ export CHECKPOINTS_DIR=./checkpoints export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json -export EVAL_DIR=${DATASETS_DIR}/val2017 -export RUN_DIR=${PROJ_DIR}/ -export CONFIG_DIR=config/YOLOV5S_CONFIG +export EVAL_DIR=${DATASETS_DIR}/images/val2017 +export RUN_DIR=../../ixrt_common +export CONFIG_DIR=../../ixrt_common/config/YOLOV5S_CONFIG ``` ### FP16 diff --git a/models/cv/object_detection/yolov5s/ixrt/build_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_engine.py deleted file mode 100644 index a919bdd0183197ce125aa5492ec83e58e035675d..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/build_engine.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import cv2 -import argparse -import numpy as np - -import torch -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(config.model) - - precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 - # print("precision : ", precision) - build_config.set_flag(precision) - - plan = builder.build_serialized_network(network, build_config) - engine_file_path = config.engine - with open(engine_file_path, "wb") as f: - f.write(plan) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - # engine args - parser.add_argument("--engine", type=str, default=None) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py deleted file mode 100644 index 3be0d83d0d966018f59b87d22f628b9b1ddf9b21..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
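The per-model build_engine.py removed above is assumed to be consolidated into the shared `../../ixrt_common` scripts that the updated READMEs point at. A minimal sketch for sanity-checking that a serialized engine loads back (the engine path is a placeholder, and `libixrt_plugin` must be loadable):

```python
# Sketch: deserialize a built engine to confirm the serialized plan is valid.
import tensorrt
from load_ixrt_plugin import load_ixrt_plugin

load_ixrt_plugin()
logger = tensorrt.Logger(tensorrt.Logger.WARNING)
with open("checkpoints/yolov5s_fp16.engine", "rb") as f:  # placeholder path
    engine = tensorrt.Runtime(logger).deserialize_cuda_engine(f.read())
assert engine is not None, "engine failed to deserialize"
```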
- -import os -import argparse -import torch -import onnx -from onnx import helper -from onnx import TensorProto, numpy_helper -import tensorrt - -def create_onnx(args): - nms = helper.make_node( - "NMS", - name="NMS", - inputs=["nms_input"], - outputs=["nms_output0", "nms_output1"], - nMaxKeep=args.max_box_pre_img, - fIoUThresh=args.iou_thresh, - fScoreThresh=args.score_thresh - ) - graph = helper.make_graph( - nodes=[nms], - name="gpu_nms", - inputs=[ - helper.make_tensor_value_info( - "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6) - ) - ], - outputs=[ - helper.make_tensor_value_info( - "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6) - ), - helper.make_tensor_value_info( - "nms_output1", onnx.TensorProto.INT32, (args.bsz,) - ) - ], - initializer=[] - ) - - op = onnx.OperatorSetIdProto() - op.version = 13 - model = onnx.helper.make_model(graph) - - model = onnx.helper.make_model(graph, opset_imports=[op]) - onnx_path = args.path + "/nms.onnx" - onnx.save(model, onnx_path) - -def build_engine(args): - onnx_path = args.path + "/nms.onnx" - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(onnx_path) - plan = builder.build_serialized_network(network, build_config) - - engine_path = args.path + "/nms.engine" - with open(engine_path, "wb") as f: - f.write(plan) - -def main(args): - create_onnx(args) - build_engine(args) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--bsz", type=int, default=1, help="batch size") - parser.add_argument("--path", type=str) - parser.add_argument("--all_box_num", type=int, default=25200) - parser.add_argument("--max_box_pre_img", type=int, default=1000) - parser.add_argument("--iou_thresh", type=float, default=0.6) - parser.add_argument("--score_thresh", type=float, default=0.001) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh index b9f3a57f9d4bb7c25c55aa5621b8eb378093bd03..a08c47d7053f6c3a910a533b81e4331e8dbf3dfc 100644 --- a/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh @@ -25,7 +25,7 @@ else echo "Not Support Os" fi -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt mkdir -p checkpoints cp -r /root/data/3rd_party/yolov5 ./ diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py deleted file mode 100755 index 162e24b462289dcee7b7a2888b93fad1115def81..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py b/models/cv/object_detection/yolov5s/ixrt/datasets/common.py deleted file mode 100755 index ef36eba394917bc05af46f33be48463df50f540d..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32): - # Resize and pad image while meeting stride-multiple constraints - shape = im.shape[:2] # current shape [height, width] - if isinstance(new_shape, int): - new_shape = (new_shape, new_shape) - - # Scale ratio (new / old) - r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) - if not scaleup: # only scale down, do not scale up (for better val mAP) - r = min(r, 1.0) - - # Compute padding - ratio = r, r # width, height ratios - new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) - dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding - if auto: # minimum rectangle - dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding - elif scaleFill: # stretch - dw, dh = 0.0, 0.0 - new_unpad = (new_shape[1], new_shape[0]) - ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios - - dw /= 2 # divide padding into 2 sides - dh /= 2 - - if shape[::-1] != new_unpad: # resize - im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) - top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) - left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) - im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border - return im, ratio, (dw, dh) - -def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False): - # Rescale boxes (xyxy) from net_shape to ori_shape - - if use_letterbox: - - gain = min( - net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1] - ) # gain = new / old - pad = (net_shape[1] - ori_shape[1] * gain) / 2, ( - net_shape[0] - ori_shape[0] * gain - ) / 2.0 - - boxes[:, [0, 2]] -= pad[0] # x padding - boxes[:, [1, 3]] -= pad[1] # y padding - boxes[:, :4] /= gain - else: - x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0] - - boxes[:, 0] /= x_scale - boxes[:, 1] /= y_scale - boxes[:, 2] /= x_scale - boxes[:, 3] /= y_scale - - clip_boxes(boxes, ori_shape) - return boxes - -def 
clip_boxes(boxes, shape): - - boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 - boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py deleted file mode 100755 index 8590816a0df18b6ef296ebe305b15b81240ab1d0..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox, scale_boxes, clip_boxes - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Postprocess - elif data_process_type == "yolov3": - return Yolov3Postprocess - elif data_process_type == "yolox": - return YoloxPostprocess - return None - -def Yolov3Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=False - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def Yolov5Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=True - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def YoloxPostprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i]) - boxes[:, :4] /= r - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i])) - - all_box.append(boxes) - data_offset += max_det * 6 - - return 
all_box \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/deploy.py b/models/cv/object_detection/yolov5s/ixrt/deploy.py deleted file mode 100644 index 37c5f9ac9d1893978f09f1717d99b5857274121e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/deploy.py +++ /dev/null @@ -1,150 +0,0 @@ -# !/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import argparse -from tensorrt.deploy.api import GraphTransform, create_source, create_target - -class Transform: - def __init__(self, graph): - self.t = GraphTransform(graph) - self.graph = graph - - def ReplaceFocus(self, input_edge, outputs, to_op): - input_var = self.graph.get_variable(input_edge) - op = self.graph.get_operator(to_op) - self.t.delete_operators_between_var_op( - from_var=input_var, to_op=op - ) - self.t.make_operator( - "Focus", inputs=input_edge, outputs=outputs - ) - return self.graph - - def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes): - if attributes["anchor"] is None: - del attributes["anchor"] - self.t.make_operator( - op_type, inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - - def AddConcatOp(self, inputs: list, outputs, **attributes): - self.t.make_operator( - "Concat", inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - -def customize_ops(graph, args): - t = Transform(graph) - fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None - if fuse_focus: - graph = t.ReplaceFocus( - input_edge=args.focus_input, - outputs=args.focus_output, - to_op=args.focus_last_node - ) - decoder_input = args.decoder_input_names - num = len(decoder_input) // 3 - graph = t.AddYoloDecoderOp( - inputs=decoder_input[:num], - outputs=["decoder_8"], - op_type=args.decoder_type, - anchor=args.decoder8_anchor, - num_class=args.num_class, - stride=8, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num:num*2], - outputs=["decoder_16"], - op_type=args.decoder_type, - anchor=args.decoder16_anchor, - num_class=args.num_class, - stride=16, - faster_impl=args.faster - ) - - if args.decoder64_anchor is not None: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:num*2+1], - outputs=["decoder_32"], - op_type=args.decoder_type, - anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2+1:], - outputs=["decoder_64"], - op_type=args.decoder_type, - anchor=args.decoder64_anchor, - num_class=args.num_class, - stride=64, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"], - outputs=["output"], - axis=1 - ) - else: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:], - outputs=["decoder_32"], - op_type=args.decoder_type, - 
anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_32", "decoder_16", "decoder_8"], - outputs=["output"], - axis=1 - ) - - graph.outputs.clear() - graph.add_output("output") - graph.outputs["output"].dtype = "FLOAT" - return graph - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--src", type=str) - parser.add_argument("--dst", type=str) - parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"]) - parser.add_argument("--decoder_input_names", nargs='+', type=str) - parser.add_argument("--decoder8_anchor", nargs='*', type=int) - parser.add_argument("--decoder16_anchor", nargs='*', type=int) - parser.add_argument("--decoder32_anchor", nargs='*', type=int) - parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None) - parser.add_argument("--num_class", type=int, default=80) - parser.add_argument("--faster", type=int, default=1) - parser.add_argument("--focus_input", type=str, default=None) - parser.add_argument("--focus_output", type=str, default=None) - parser.add_argument("--focus_last_node", type=str, default=None) - args = parser.parse_args() - return args - -if __name__ == "__main__": - - args = parse_args() - graph = create_source(args.src)() - graph = customize_ops(graph, args) - create_target(saved_path=args.dst).export(graph) - print("Surged onnx lies on", args.dst) \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py deleted file mode 100644 index ae47dc8e854b6bea1f768e65c4dd481048bfebce..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import ctypes -import tensorrt -from os.path import join, dirname, exists -def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") - if not exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") - ctypes.CDLL(dynamic_path) - tensorrt.init_libnvinfer_plugins(logger, namespace) - print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/requirements.txt b/models/cv/object_detection/yolov5s/ixrt/requirements.txt deleted file mode 100644 index b1a10ab060644ea96d6ad77b36dbc4367a632591..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -tqdm -onnx -onnxsim -ultralytics==8.3.97 -pycocotools -pycuda \ No newline at end of file diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh index 81b32fd1ec2538faad36dc432817cd8036032a95..52ec959f1ea4b6b192b111c5b837904183d17876 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
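The helper above also accepts an explicit library path, useful when the plugin lives outside the tensorrt package tree; a hypothetical override (the path below is illustrative only):

```python
# Hypothetical: point load_ixrt_plugin at a custom libixrt_plugin.so
# instead of the default <tensorrt dir>/lib location it resolves above.
from load_ixrt_plugin import load_ixrt_plugin

load_ixrt_plugin(dynamic_path="/opt/ixrt/lib/libixrt_plugin.so")
```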
EXIT_STATUS=0 check_status() @@ -48,8 +34,6 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} echo DATASETS_DIR : ${DATASETS_DIR} -echo COCO_GT : ${COCO_GT} -echo EVAL_DIR : ${EVAL_DIR} echo RUN_DIR : ${RUN_DIR} echo CONFIG_DIR : ${CONFIG_DIR} echo ====================== Model Info ====================== @@ -98,7 +82,7 @@ if [ $PRECISION == "int8" ];then echo; echo [STEP ${step}] : Quant Model if [[ -z ${QUANT_EXIST_ONNX} ]];then - QUANT_EXIST_ONNX=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}.onnx + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx fi if [[ -f ${QUANT_EXIST_ONNX} ]];then CURRENT_MODEL=${QUANT_EXIST_ONNX} @@ -112,7 +96,7 @@ if [ $PRECISION == "int8" ];then --data_process_type ${DATA_PROCESS_TYPE} \ --observer ${QUANT_OBSERVER} \ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ - --save_dir ${CHECKPOINTS_DIR} \ + --save_dir $CHECKPOINTS_DIR \ --bsz ${QUANT_BATCHSIZE} \ --step ${QUANT_STEP} \ --seed ${QUANT_SEED} \ diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh index 4ab4f9e413d156ac4b2b669240f55f09cf5eb14a..5e2f97fb43b1480fcce35669e7d2346f63eb64a8 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() @@ -48,8 +34,6 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} echo DATASETS_DIR : ${DATASETS_DIR} -echo COCO_GT : ${COCO_GT} -echo EVAL_DIR : ${EVAL_DIR} echo RUN_DIR : ${RUN_DIR} echo CONFIG_DIR : ${CONFIG_DIR} echo ====================== Model Info ====================== @@ -112,7 +96,7 @@ if [ $PRECISION == "int8" ];then --data_process_type ${DATA_PROCESS_TYPE} \ --observer ${QUANT_OBSERVER} \ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ - --save_dir ${CHECKPOINTS_DIR} \ + --save_dir $CHECKPOINTS_DIR \ --bsz ${QUANT_BATCHSIZE} \ --step ${QUANT_STEP} \ --seed ${QUANT_SEED} \ diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh index fc7988dfd37e36039a8d1d835dbc558d9da2ffe5..606fc94c8c7f11a517b7bf3fe1fcf22ccb6d68d1 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() @@ -48,8 +34,6 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} echo DATASETS_DIR : ${DATASETS_DIR} -echo COCO_GT : ${COCO_GT} -echo EVAL_DIR : ${EVAL_DIR} echo RUN_DIR : ${RUN_DIR} echo CONFIG_DIR : ${CONFIG_DIR} echo ====================== Model Info ====================== @@ -98,7 +82,7 @@ if [ $PRECISION == "int8" ];then echo; echo [STEP ${step}] : Quant Model if [[ -z ${QUANT_EXIST_ONNX} ]];then - QUANT_EXIST_ONNX=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}.onnx + QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx fi if [[ -f ${QUANT_EXIST_ONNX} ]];then CURRENT_MODEL=${QUANT_EXIST_ONNX} @@ -112,7 +96,7 @@ if [ $PRECISION == "int8" ];then --data_process_type ${DATA_PROCESS_TYPE} \ --observer ${QUANT_OBSERVER} \ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ - --save_dir ${CHECKPOINTS_DIR} \ + --save_dir $CHECKPOINTS_DIR \ --bsz ${QUANT_BATCHSIZE} \ --step ${QUANT_STEP} \ --seed ${QUANT_SEED} \ diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh index dc912fa9873e9ced8683a41c6f9600a8e5a80b62..b29836695881a184603d63159e0872c508d94486 100644 --- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh +++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
EXIT_STATUS=0 check_status() @@ -48,8 +34,6 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL} echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR} echo DATASETS_DIR : ${DATASETS_DIR} -echo COCO_GT : ${COCO_GT} -echo EVAL_DIR : ${EVAL_DIR} echo RUN_DIR : ${RUN_DIR} echo CONFIG_DIR : ${CONFIG_DIR} echo ====================== Model Info ====================== @@ -113,7 +97,7 @@ if [ $PRECISION == "int8" ];then --data_process_type ${DATA_PROCESS_TYPE} \ --observer ${QUANT_OBSERVER} \ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \ - --save_dir ${CHECKPOINTS_DIR} \ + --save_dir $CHECKPOINTS_DIR \ --bsz ${QUANT_BATCHSIZE} \ --step ${QUANT_STEP} \ --seed ${QUANT_SEED} \ diff --git a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py b/models/cv/object_detection/yolov5s/ixrt/simplify_model.py deleted file mode 100644 index 1400fd81ddb4b3fae1b20d0fd35082a692f5d292..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import onnx -import argparse -from onnxsim import simplify - -# Simplify -def simplify_model(args): - onnx_model = onnx.load(args.origin_model) - model_simp, check = simplify(onnx_model) - model_simp = onnx.shape_inference.infer_shapes(model_simp) - onnx.save(model_simp, args.output_model) - print(" Simplify onnx Done.") - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--origin_model", type=str) - parser.add_argument("--output_model", type=str) - args = parser.parse_args() - return args - -args = parse_args() -simplify_model(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/README.md b/models/cv/object_detection/yolov7/ixrt/README.md index b366ce6e10956385815e45002227931df0cc73a6..8ff917cc0f74b5a72c4f60e9818a932fd24f5f87 100644 --- a/models/cv/object_detection/yolov7/ixrt/README.md +++ b/models/cv/object_detection/yolov7/ixrt/README.md @@ -30,7 +30,7 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-glx -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt ``` ### Model Conversion @@ -47,13 +47,13 @@ mv yolov7.onnx /Path/to/checkpoints/yolov7m.onnx ## Model Inference ```bash -export PROJ_DIR=/Path/to/yolov7/ixrt -export DATASETS_DIR=/Path/to/coco2017/ +export PROJ_DIR=./ +export DATASETS_DIR=/Path/to/coco/ export CHECKPOINTS_DIR=./checkpoints export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json -export EVAL_DIR=${DATASETS_DIR}/val2017 -export RUN_DIR=/Path/to/yolov7/ixrt -export CONFIG_DIR=config/YOLOV7_CONFIG +export EVAL_DIR=${DATASETS_DIR}/images/val2017 +export RUN_DIR=../../ixrt_common +export CONFIG_DIR=../../ixrt_common/config/YOLOV7_CONFIG ``` ### FP16 diff --git a/models/cv/object_detection/yolov7/ixrt/build_engine.py b/models/cv/object_detection/yolov7/ixrt/build_engine.py deleted file mode 100644 index 
a919bdd0183197ce125aa5492ec83e58e035675d..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/build_engine.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -import cv2 -import argparse -import numpy as np - -import torch -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(config.model) - - precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16 - # print("precision : ", precision) - build_config.set_flag(precision) - - plan = builder.build_serialized_network(network, build_config) - engine_file_path = config.engine - with open(engine_file_path, "wb") as f: - f.write(plan) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", type=str) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - # engine args - parser.add_argument("--engine", type=str, default=None) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py deleted file mode 100644 index 3be0d83d0d966018f59b87d22f628b9b1ddf9b21..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -import os -import argparse -import torch -import onnx -from onnx import helper -from onnx import TensorProto, numpy_helper -import tensorrt - -def create_onnx(args): - nms = helper.make_node( - "NMS", - name="NMS", - inputs=["nms_input"], - outputs=["nms_output0", "nms_output1"], - nMaxKeep=args.max_box_pre_img, - fIoUThresh=args.iou_thresh, - fScoreThresh=args.score_thresh - ) - graph = helper.make_graph( - nodes=[nms], - name="gpu_nms", - inputs=[ - helper.make_tensor_value_info( - "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6) - ) - ], - outputs=[ - helper.make_tensor_value_info( - "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6) - ), - helper.make_tensor_value_info( - "nms_output1", onnx.TensorProto.INT32, (args.bsz,) - ) - ], - initializer=[] - ) - - op = onnx.OperatorSetIdProto() - op.version = 13 - model = onnx.helper.make_model(graph) - - model = onnx.helper.make_model(graph, opset_imports=[op]) - onnx_path = args.path + "/nms.onnx" - onnx.save(model, onnx_path) - -def build_engine(args): - onnx_path = args.path + "/nms.onnx" - IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING) - builder = tensorrt.Builder(IXRT_LOGGER) - EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - build_config = builder.create_builder_config() - parser = tensorrt.OnnxParser(network, IXRT_LOGGER) - parser.parse_from_file(onnx_path) - plan = builder.build_serialized_network(network, build_config) - - engine_path = args.path + "/nms.engine" - with open(engine_path, "wb") as f: - f.write(plan) - -def main(args): - create_onnx(args) - build_engine(args) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--bsz", type=int, default=1, help="batch size") - parser.add_argument("--path", type=str) - parser.add_argument("--all_box_num", type=int, default=25200) - parser.add_argument("--max_box_pre_img", type=int, default=1000) - parser.add_argument("--iou_thresh", type=float, default=0.6) - parser.add_argument("--score_thresh", type=float, default=0.001) - - args = parser.parse_args() - return args - -if __name__ == "__main__": - args = parse_args() - main(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py deleted file mode 100644 index de37775a0c617fdefca4342423a6a47bdc9b9c41..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
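On the `--all_box_num` default of 25200 in build_nms_engine.py: it matches a 640x640 YOLOv5/YOLOv7-style model with three detection heads (strides 8/16/32) and three anchors per cell, as this quick check shows:

```python
# (80*80 + 40*40 + 20*20) cells * 3 anchors = 25200 candidate boxes
imgsz, strides, anchors = 640, (8, 16, 32), 3
print(sum((imgsz // s) ** 2 for s in strides) * anchors)  # 25200
```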
- -import os -import torch -import torchvision.datasets -from torch.utils.data import DataLoader -from datasets.coco import CocoDetection - -def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"): - dataset = CocoDetection( - root=data_path, - annFile=annFile, - img_size=img_sz, - data_process_type=data_process_type - ) - calibration_dataset = dataset - num_samples = min(5000, batch_size * step) - if num_samples > 0: - calibration_dataset = torch.utils.data.Subset( - dataset, indices=range(num_samples) - ) - - calibration_dataloader = DataLoader( - calibration_dataset, - shuffle=False, - batch_size=batch_size, - drop_last=False, - num_workers=workers, - ) - return calibration_dataloader \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh index 310566fb8b6ddbba24aecc3fdace7d7146063f3d..611fcd19698a0d5642730f04c95da79f27711711 100644 --- a/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh +++ b/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh @@ -25,7 +25,7 @@ else echo "Not Support Os" fi -pip3 install -r requirements.txt +pip3 install -r ../../ixrt_common/requirements.txt mkdir -p checkpoints cp -r /root/data/3rd_party/yolov7 ./ cd yolov7 diff --git a/models/cv/object_detection/yolov7/ixrt/coco_labels.py b/models/cv/object_detection/yolov7/ixrt/coco_labels.py deleted file mode 100644 index 43f5bd82cd257efdcab2bdba6bad64d9bb90416e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/coco_labels.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
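A usage sketch for the `create_dataloaders` helper above, assuming the COCO layout used elsewhere in this change (paths are placeholders):

```python
from calibration_dataset import create_dataloaders

loader = create_dataloaders(
    data_path="coco/images/val2017",
    annFile="coco/annotations/instances_val2017.json",
    img_sz=640,
    batch_size=32,
    step=32,
    data_process_type="yolov5",
)
# Each batch follows CocoDetection's (image, origin_shape, image_id) layout.
images, origin_shapes, image_ids = next(iter(loader))
print(images.shape)  # torch.Size([32, 3, 640, 640])
```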
- -labels = [ - "person", - "bicycle", - "car", - "motorcycle", - "airplane", - "bus", - "train", - "truck", - "boat", - "traffic light", - "fire hydrant", - "stop sign", - "parking meter", - "bench", - "bird", - "cat", - "dog", - "horse", - "sheep", - "cow", - "elephant", - "bear", - "zebra", - "giraffe", - "backpack", - "umbrella", - "handbag", - "tie", - "suitcase", - "frisbee", - "skis", - "snowboard", - "sports ball", - "kite", - "baseball bat", - "baseball glove", - "skateboard", - "surfboard", - "tennis racket", - "bottle", - "wine glass", - "cup", - "fork", - "knife", - "spoon", - "bowl", - "banana", - "apple", - "sandwich", - "orange", - "broccoli", - "carrot", - "hot dog", - "pizza", - "donut", - "cake", - "chair", - "couch", - "potted plant", - "bed", - "dining table", - "toilet", - "tv", - "laptop", - "mouse", - "remote", - "keyboard", - "cell phone", - "microwave", - "oven", - "toaster", - "sink", - "refrigerator", - "book", - "clock", - "vase", - "scissors", - "teddy bear", - "hair drier", - "toothbrush", -] -def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper) - return [ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, - 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] - -__all__ = ["labels"] diff --git a/models/cv/object_detection/yolov7/ixrt/common.py b/models/cv/object_detection/yolov7/ixrt/common.py deleted file mode 100644 index aba2117c9942d6823abf73bf3ab94c291a7705e2..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/common.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
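`coco80_to_coco91_class` bridges the model's contiguous 80-class indices and the sparse official COCO category ids expected in result JSON; for example:

```python
from coco_labels import labels, coco80_to_coco91_class

idx80 = labels.index("dog")             # 16 in the contiguous 80-class list
print(coco80_to_coco91_class()[idx80])  # 18, the official COCO category id
```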
- -import numpy as np -from tqdm import tqdm - -import tensorrt -import pycuda.driver as cuda - -# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)] -# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)] -def box_class85to6(input): - center_x_y = input[:, :2] - side = input[:, 2:4] - conf = input[:, 4:5] - class_id = np.argmax(input[:, 5:], axis = -1) - class_id = class_id.astype(np.float32).reshape(-1, 1) + 1 - max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1) - x1_y1 = center_x_y - 0.5 * side - x2_y2 = center_x_y + 0.5 * side - nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1) - return nms_input - -def save2json(batch_img_id, pred_boxes, json_result, class_trans): - for i, boxes in enumerate(pred_boxes): - if boxes is not None: - image_id = int(batch_img_id[i]) - # have no target - if image_id == -1: - continue - for x, y, w, h, c, p in boxes: - x, y, w, h, p = float(x), float(y), float(w), float(h), float(p) - c = int(c) - json_result.append( - { - "image_id": image_id, - "category_id": class_trans[c - 1], - "bbox": [x, y, w, h], - "score": p, - } - ) - -def create_engine_context(engine_path, logger): - with open(engine_path, "rb") as f: - runtime = tensorrt.Runtime(logger) - assert runtime - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine - context = engine.create_execution_context() - assert context - - return engine, context - -def get_io_bindings(engine): - # Setup I/O bindings - inputs = [] - outputs = [] - allocations = [] - - for i in range(engine.num_bindings): - is_input = False - if engine.binding_is_input(i): - is_input = True - name = engine.get_binding_name(i) - dtype = engine.get_binding_dtype(i) - shape = engine.get_binding_shape(i) - if is_input: - batch_size = shape[0] - size = np.dtype(tensorrt.nptype(dtype)).itemsize - for s in shape: - size *= s - allocation = cuda.mem_alloc(size) - binding = { - "index": i, - "name": name, - "dtype": np.dtype(tensorrt.nptype(dtype)), - "shape": list(shape), - "allocation": allocation, - } - print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}") - allocations.append(allocation) - if engine.binding_is_input(i): - inputs.append(binding) - else: - outputs.append(binding) - return inputs, outputs, allocations \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/cut_model.py b/models/cv/object_detection/yolov7/ixrt/cut_model.py deleted file mode 100644 index e9ee19aadf0809fe1b97e3225d09150fb54513f7..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/cut_model.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
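A toy check of `box_class85to6` with a two-class layout (the function is column-count agnostic; the "85" in the name only reflects COCO's 5 + 80 columns):

```python
import numpy as np
from common import box_class85to6

# one box: cx=100, cy=100, w=20, h=40, conf=0.9, class probs (0.2, 0.7)
raw = np.array([[100., 100., 20., 40., 0.9, 0.2, 0.7]], dtype=np.float32)
print(box_class85to6(raw))
# [[ 90.  80. 110. 120.   2.    0.63]] -> x1, y1, x2, y2, 1-based class, prob*conf
```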
- -import onnx -import argparse -from onnxsim import simplify - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_model", type=str) - parser.add_argument("--output_model", type=str) - parser.add_argument("--input_names", nargs='+', type=str) - parser.add_argument("--output_names", nargs='+', type=str) - args = parser.parse_args() - return args - -args = parse_args() -onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names) -print(" Cut Model Done.") \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py deleted file mode 100644 index 162e24b462289dcee7b7a2888b93fad1115def81..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py b/models/cv/object_detection/yolov7/ixrt/datasets/coco.py deleted file mode 100644 index 73c5df54761b917ecd0127fb56b61d9bd34c1196..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os.path -from typing import Any, Callable, List, Optional, Tuple - -import cv2 - -from .vision import VisionDataset -from .pre_process import get_post_process -class CocoDetection(VisionDataset): - """`MS Coco Detection `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. 
- """ - - def __init__( - self, - root: str, - annFile: str, - img_size: int, - data_process_type: str, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - transforms: Optional[Callable] = None, - - ) -> None: - super().__init__(root, transforms, transform, target_transform) - from pycocotools.coco import COCO - - self.coco = COCO(annFile) - self.ids = list(sorted(self.coco.imgs.keys())) - self.img_size = img_size - - self.transforms = get_post_process(data_process_type) - - def _load_image(self, id: int): - path = self.coco.loadImgs(id)[0]["file_name"] - data = cv2.imread(os.path.join(self.root, path)) - return data - - def _load_target(self, id: int) -> List[Any]: - return self.coco.loadAnns(self.coco.getAnnIds(id)) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - id = self.ids[index] - image = self._load_image(id) - target = self._load_target(id) - origin_shape = image.shape[:2] - - if self.transforms is not None: - image = self.transforms(image, self.img_size) - - if len(target) > 0: - image_id = target[0]["image_id"] - else: - # have no target - image_id = -1 - return image, origin_shape, image_id - - def __len__(self) -> int: - return len(self.ids) - - -class CocoCaptions(CocoDetection): - """`MS Coco Captions `_ Dataset. - - It requires the `COCO API to be installed `_. - - Args: - root (string): Root directory where images are downloaded to. - annFile (string): Path to json annotation file. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.PILToTensor`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - transforms (callable, optional): A function/transform that takes input sample and its target as entry - and returns a transformed version. - - Example: - - .. code:: python - - import torchvision.datasets as dset - import torchvision.transforms as transforms - cap = dset.CocoCaptions(root = 'dir where images are', - annFile = 'json annotation file', - transform=transforms.PILToTensor()) - - print('Number of samples: ', len(cap)) - img, target = cap[3] # load 4th sample - - print("Image Size: ", img.size()) - print(target) - - Output: :: - - Number of samples: 82783 - Image Size: (3L, 427L, 640L) - [u'A plane emitting smoke stream flying over a mountain.', - u'A plane darts across a bright blue sky behind a mountain covered in snow', - u'A plane leaves a contrail above the snowy mountain top.', - u'A mountain that has a plane flying overheard in the distance.', - u'A mountain view with a plume of smoke in the background'] - - """ - - def _load_target(self, id: int) -> List[str]: - return [ann["caption"] for ann in super()._load_target(id)] diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/common.py b/models/cv/object_detection/yolov7/ixrt/datasets/common.py deleted file mode 100644 index ef36eba394917bc05af46f33be48463df50f540d..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/common.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32): - # Resize and pad image while meeting stride-multiple constraints - shape = im.shape[:2] # current shape [height, width] - if isinstance(new_shape, int): - new_shape = (new_shape, new_shape) - - # Scale ratio (new / old) - r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) - if not scaleup: # only scale down, do not scale up (for better val mAP) - r = min(r, 1.0) - - # Compute padding - ratio = r, r # width, height ratios - new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) - dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding - if auto: # minimum rectangle - dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding - elif scaleFill: # stretch - dw, dh = 0.0, 0.0 - new_unpad = (new_shape[1], new_shape[0]) - ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios - - dw /= 2 # divide padding into 2 sides - dh /= 2 - - if shape[::-1] != new_unpad: # resize - im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) - top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) - left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) - im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border - return im, ratio, (dw, dh) - -def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False): - # Rescale boxes (xyxy) from net_shape to ori_shape - - if use_letterbox: - - gain = min( - net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1] - ) # gain = new / old - pad = (net_shape[1] - ori_shape[1] * gain) / 2, ( - net_shape[0] - ori_shape[0] * gain - ) / 2.0 - - boxes[:, [0, 2]] -= pad[0] # x padding - boxes[:, [1, 3]] -= pad[1] # y padding - boxes[:, :4] /= gain - else: - x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0] - - boxes[:, 0] /= x_scale - boxes[:, 1] /= y_scale - boxes[:, 2] /= x_scale - boxes[:, 3] /= y_scale - - clip_boxes(boxes, ori_shape) - return boxes - -def clip_boxes(boxes, shape): - - boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2 - boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2 \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py deleted file mode 100644 index 8590816a0df18b6ef296ebe305b15b81240ab1d0..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox, scale_boxes, clip_boxes - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Postprocess - elif data_process_type == "yolov3": - return Yolov3Postprocess - elif data_process_type == "yolox": - return YoloxPostprocess - return None - -def Yolov3Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=False - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def Yolov5Postprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - boxes = scale_boxes( - (imgsz[0], imgsz[1]), - cur_box, - (ori_img_shape[0][i], ori_img_shape[1][i]), - use_letterbox=True - ) - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box - -def YoloxPostprocess( - ori_img_shape, - imgsz, - box_datas, - box_nums, - sample_num, - max_det=1000, -): - all_box = [] - data_offset = 0 - box_datas = box_datas.flatten() - box_nums = box_nums.flatten() - - for i in range(sample_num): - box_num = box_nums[i] - if box_num == 0: - boxes = None - else: - boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6) - r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i]) - boxes[:, :4] /= r - # xyxy2xywh - boxes[:, 2] -= boxes[:, 0] - boxes[:, 3] -= boxes[:, 1] - clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i])) - - all_box.append(boxes) - data_offset += max_det * 6 - - return all_box \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py deleted file mode 100644 index c651f8adb7c8190c214fbbbb7769c7d0713e9619..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import cv2 -import math -import numpy as np - -from .common import letterbox - -def get_post_process(data_process_type): - if data_process_type == "yolov5": - return Yolov5Preprocess - elif data_process_type == "yolov3": - return Yolov3Preprocess - elif data_process_type == "yolox": - return YoloxPreprocess - return None - -def Yolov3Preprocess(image, img_size): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - image = cv2.resize(image, (img_size, img_size)) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def Yolov5Preprocess(image, img_size, augment=False): - - h0, w0 = image.shape[:2] # orig hw - r = img_size / max(h0, w0) # ratio - - if r != 1: # if sizes are not equal - interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA - image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp) - - # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size rect == True - - image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False) - image = image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB - image = np.ascontiguousarray(image).astype(np.float32) / 255.0 # 0~1 np array - return image - -def YoloxPreprocess(img, img_size, swap=(2,0,1)): - - padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114 - r = min(img_size / img.shape[0], img_size / img.shape[1]) - resized_img = cv2.resize( - img, - (int(img.shape[1] * r), int(img.shape[0] * r)), - interpolation=cv2.INTER_LINEAR, - ).astype(np.uint8) - - padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img - padded_img = padded_img.transpose(swap) - padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) - - return padded_img \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py b/models/cv/object_detection/yolov7/ixrt/datasets/vision.py deleted file mode 100644 index eadefb2c5b35abd0a11fa85c65891461a210aef8..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
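For readers tracing what is being dropped here: the removed `pre_process.py` implemented the three input pipelines selected by `data_process_type`, and the YOLOX branch is the easiest to see end to end. A minimal, self-contained sketch of that pad-to-square step (the input path is a placeholder, not from the repo):

```python
import cv2
import numpy as np

# YOLOX-style preprocessing, as in the removed YoloxPreprocess:
# scale to fit, pad the remainder with 114s, then HWC -> CHW float32.
img = cv2.imread("sample.jpg")  # placeholder input image
img_size = 640
padded = np.full((img_size, img_size, 3), 114, dtype=np.uint8)
r = min(img_size / img.shape[0], img_size / img.shape[1])
resized = cv2.resize(
    img,
    (int(img.shape[1] * r), int(img.shape[0] * r)),
    interpolation=cv2.INTER_LINEAR,
).astype(np.uint8)
padded[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized
chw = np.ascontiguousarray(padded.transpose((2, 0, 1)), dtype=np.float32)
```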
- -import os -from typing import Any, Callable, List, Optional, Tuple - -import torch -import torch.utils.data as data - -from types import FunctionType - -def _log_api_usage_once(obj: Any) -> None: - - """ - Logs API usage(module and name) within an organization. - In a large ecosystem, it's often useful to track the PyTorch and - TorchVision APIs usage. This API provides the similar functionality to the - logging module in the Python stdlib. It can be used for debugging purpose - to log which methods are used and by default it is inactive, unless the user - manually subscribes a logger via the `SetAPIUsageLogger method `_. - Please note it is triggered only once for the same API call within a process. - It does not collect any data from open-source users since it is no-op by default. - For more information, please refer to - * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; - * Logging policy: https://github.com/pytorch/vision/issues/5052; - - Args: - obj (class instance or method): an object to extract info from. - """ - module = obj.__module__ - if not module.startswith("torchvision"): - module = f"torchvision.internal.{module}" - name = obj.__class__.__name__ - if isinstance(obj, FunctionType): - name = obj.__name__ - torch._C._log_api_usage_once(f"{module}.{name}") - -class VisionDataset(data.Dataset): - """ - Base Class For making datasets which are compatible with torchvision. - It is necessary to override the ``__getitem__`` and ``__len__`` method. - - Args: - root (string): Root directory of dataset. - transforms (callable, optional): A function/transforms that takes in - an image and a label and returns the transformed versions of both. - transform (callable, optional): A function/transform that takes in an PIL image - and returns a transformed version. E.g, ``transforms.RandomCrop`` - target_transform (callable, optional): A function/transform that takes in the - target and transforms it. - - .. note:: - - :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive. - """ - - _repr_indent = 4 - - def __init__( - self, - root: str, - transforms: Optional[Callable] = None, - transform: Optional[Callable] = None, - target_transform: Optional[Callable] = None, - ) -> None: - _log_api_usage_once(self) - if isinstance(root, str): - root = os.path.expanduser(root) - self.root = root - - has_transforms = transforms is not None - has_separate_transform = transform is not None or target_transform is not None - if has_transforms and has_separate_transform: - raise ValueError("Only transforms or transform/target_transform can be passed as argument") - - # for backwards-compatibility - self.transform = transform - self.target_transform = target_transform - - if has_separate_transform: - transforms = StandardTransform(transform, target_transform) - self.transforms = transforms - - def __getitem__(self, index: int) -> Any: - """ - Args: - index (int): Index - - Returns: - (Any): Sample and meta data, optionally transformed by the respective transforms. 
- """ - raise NotImplementedError - - def __len__(self) -> int: - raise NotImplementedError - - def __repr__(self) -> str: - head = "Dataset " + self.__class__.__name__ - body = [f"Number of datapoints: {self.__len__()}"] - if self.root is not None: - body.append(f"Root location: {self.root}") - body += self.extra_repr().splitlines() - if hasattr(self, "transforms") and self.transforms is not None: - body += [repr(self.transforms)] - lines = [head] + [" " * self._repr_indent + line for line in body] - return "\n".join(lines) - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def extra_repr(self) -> str: - return "" - - -class StandardTransform: - def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: - self.transform = transform - self.target_transform = target_transform - - def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]: - if self.transform is not None: - input = self.transform(input) - if self.target_transform is not None: - target = self.target_transform(target) - return input, target - - def _format_transform_repr(self, transform: Callable, head: str) -> List[str]: - lines = transform.__repr__().splitlines() - return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]] - - def __repr__(self) -> str: - body = [self.__class__.__name__] - if self.transform is not None: - body += self._format_transform_repr(self.transform, "Transform: ") - if self.target_transform is not None: - body += self._format_transform_repr(self.target_transform, "Target transform: ") - - return "\n".join(body) diff --git a/models/cv/object_detection/yolov7/ixrt/deploy.py b/models/cv/object_detection/yolov7/ixrt/deploy.py deleted file mode 100644 index 8c2cc424f699e01bc88dab98a29dc4c83e4d9b9e..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/deploy.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -# !/usr/bin/env python -# -*- coding: utf-8 -*- -import argparse -from tensorrt.deploy.api import GraphTransform, create_source, create_target - -class Transform: - def __init__(self, graph): - self.t = GraphTransform(graph) - self.graph = graph - - def ReplaceFocus(self, input_edge, outputs, to_op): - input_var = self.graph.get_variable(input_edge) - op = self.graph.get_operator(to_op) - self.t.delete_operators_between_var_op( - from_var=input_var, to_op=op - ) - self.t.make_operator( - "Focus", inputs=input_edge, outputs=outputs - ) - return self.graph - - def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes): - if attributes["anchor"] is None: - del attributes["anchor"] - self.t.make_operator( - op_type, inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - - def AddConcatOp(self, inputs: list, outputs, **attributes): - self.t.make_operator( - "Concat", inputs=inputs, outputs=outputs, **attributes - ) - return self.graph - -def customize_ops(graph, args): - t = Transform(graph) - fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None - if fuse_focus: - graph = t.ReplaceFocus( - input_edge=args.focus_input, - outputs=args.focus_output, - to_op=args.focus_last_node - ) - decoder_input = args.decoder_input_names - num = len(decoder_input) // 3 - graph = t.AddYoloDecoderOp( - inputs=decoder_input[:num], - outputs=["decoder_8"], - op_type=args.decoder_type, - anchor=args.decoder8_anchor, - num_class=args.num_class, - stride=8, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num:num*2], - outputs=["decoder_16"], - op_type=args.decoder_type, - anchor=args.decoder16_anchor, - num_class=args.num_class, - stride=16, - faster_impl=args.faster - ) - - if args.decoder64_anchor is not None: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:num*2+1], - outputs=["decoder_32"], - op_type=args.decoder_type, - anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2+1:], - outputs=["decoder_64"], - op_type=args.decoder_type, - anchor=args.decoder64_anchor, - num_class=args.num_class, - stride=64, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"], - outputs=["output"], - axis=1 - ) - else: - graph = t.AddYoloDecoderOp( - inputs=decoder_input[num*2:], - outputs=["decoder_32"], - op_type=args.decoder_type, - anchor=args.decoder32_anchor, - num_class=args.num_class, - stride=32, - faster_impl=args.faster - ) - graph = t.AddConcatOp( - inputs=["decoder_32", "decoder_16", "decoder_8"], - outputs=["output"], - axis=1 - ) - - graph.outputs.clear() - graph.add_output("output") - graph.outputs["output"].dtype = "FLOAT" - return graph - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--src", type=str) - parser.add_argument("--dst", type=str) - parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"]) - parser.add_argument("--decoder_input_names", nargs='+', type=str) - parser.add_argument("--decoder8_anchor", nargs='*', type=int) - parser.add_argument("--decoder16_anchor", nargs='*', type=int) - parser.add_argument("--decoder32_anchor", nargs='*', type=int) - parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None) - parser.add_argument("--num_class", type=int, default=80) - 
parser.add_argument("--faster", type=int, default=1) - parser.add_argument("--focus_input", type=str, default=None) - parser.add_argument("--focus_output", type=str, default=None) - parser.add_argument("--focus_last_node", type=str, default=None) - args = parser.parse_args() - return args - -if __name__ == "__main__": - - args = parse_args() - graph = create_source(args.src)() - graph = customize_ops(graph, args) - create_target(saved_path=args.dst).export(graph) - print("Surged onnx lies on", args.dst) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/inference.py b/models/cv/object_detection/yolov7/ixrt/inference.py deleted file mode 100644 index c0476b899ba0ec51ab4aedc0596f19cb283952ab..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/inference.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import argparse -import glob -import json -import os -import time -import sys - -import torch -import numpy as np -import pycuda.autoinit -import pycuda.driver as cuda - -from coco_labels import coco80_to_coco91_class, labels -from common import save2json, box_class85to6 -from common import create_engine_context, get_io_bindings -from calibration_dataset import create_dataloaders -from datasets.post_process import get_post_process - -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -from tqdm import tqdm -from tqdm.contrib import tzip - -import tensorrt - -from load_ixrt_plugin import load_ixrt_plugin -load_ixrt_plugin() - -def main(config): - - # Load dataloader - dataloader = create_dataloaders( - data_path=config.eval_dir, - annFile=config.coco_gt, - img_sz=config.imgsz, - batch_size=config.bsz, - step=config.loop_count, - data_process_type=config.data_process_type - ) - - # Load post process func - if config.test_mode == "MAP": - post_process_func = get_post_process(config.data_process_type) - - bsz = config.bsz - num_samples = 5000 - if config.loop_count > 0 and config.loop_count < num_samples/bsz : - num_samples = bsz * config.loop_count - num_batch = len(dataloader) - print("=" * 30) - print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}") - print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}") - print("=" * 30) - - json_result = [] - forward_time = 0.0 - class_map = coco80_to_coco91_class() - - host_mem = tensorrt.IHostMemory - logger = tensorrt.Logger(tensorrt.Logger.ERROR) - - # Load Engine - engine, context = create_engine_context(config.model_engine, logger) - inputs, outputs, allocations = get_io_bindings(engine) - - # Load nms_engine - if config.test_mode == "MAP" and config.nms_type == "GPU": - nms_engine, nms_context = create_engine_context(config.nms_engine, logger) - nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine) - nms_output0 = 
np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"]) - nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"]) - print(f"nms_output0 shape : {nms_output0.shape} nms_output0 type : {nms_output0.dtype}") - print(f"nms_output1 shape : {nms_output1.shape} nms_output1 type : {nms_output1.dtype}") - - # Warm up - if config.warm_up > 0: - print("\nWarm Start.") - for i in range(config.warm_up): - context.execute_v2(allocations) - print("Warm Done.") - - # Prepare the output data - output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"]) - print(f"output shape : {output.shape} output type : {output.dtype}") - - for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader): - batch_data = batch_data.numpy() - batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()] - # batch_img_id = batch_img_id.numpy() - - cur_bsz_sample = batch_data.shape[0] - - # Set input - cuda.memcpy_htod(inputs[0]["allocation"], batch_data) - - # Forward - start_time = time.time() - context.execute_v2(allocations) - end_time = time.time() - forward_time += end_time - start_time - - if config.test_mode == "MAP": - # Fetch output - cuda.memcpy_dtoh(output, outputs[0]["allocation"]) - - # Step 1 : prepare data to nms - _, box_num, box_unit = output.shape - if config.debug: - print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}") - - if config.decoder_faster == 0: - nms_input = box_class85to6(output.reshape(-1, box_unit)) - else: - nms_input = output - - # Step 2 : nms - # cpu nms(TODO) - - # gpu nms - if config.nms_type == "GPU": - - # Set nms input - cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input) - nms_context.execute_v2(nms_allocations) - cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"]) - cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"]) - - # Step 3 : post process + save - pred_boxes = post_process_func( - ori_img_shape=batch_img_shape, - imgsz=(config.imgsz, config.imgsz), - box_datas=nms_output0, - box_nums=nms_output1, - sample_num=cur_bsz_sample, - max_det=config.max_det - ) - save2json(batch_img_id, pred_boxes, json_result, class_map) - - fps = num_samples / forward_time - - if config.test_mode == "FPS": - print("FPS : ", fps) - print(f"Performance Check : Test {fps} >= target {config.fps_target}") - if fps >= config.fps_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - - if config.test_mode == "MAP": - if len(json_result) == 0: - print("Predict zero box!") - exit(1) - - if not os.path.exists(config.pred_dir): - os.makedirs(config.pred_dir) - - pred_json = os.path.join( - config.pred_dir, f"{config.model_name}_{config.precision}_preds.json" - ) - with open(pred_json, "w") as f: - json.dump(json_result, f) - - start_time = time.time() - anno_json = config.coco_gt - anno = COCO(anno_json) # init annotations api - pred = anno.loadRes(pred_json) # init predictions api - eval = COCOeval(anno, pred, "bbox") - - eval.evaluate() - eval.accumulate() - print( - f"==============================eval {config.model_name} {config.precision} coco map ==============================" - ) - eval.summarize() - e2e_time = time.time() - start_time - map, map50 = eval.stats[:2] - print(F"E2E time : {e2e_time:.3f} seconds") - print("MAP@0.5 : ", map50) - print(f"Accuracy Check : Test {map50} >= target {config.map_target}") - print(F"E2E time : {e2e_time:.3f} seconds") - if map50 >= config.map_target: - print("pass!") - exit() - else: - print("failed!") - exit(1) - -def parse_config(): - parser = argparse.ArgumentParser() - 
parser.add_argument( - "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX" - ) - parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8", - help="The precision of datatype") - parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP") - parser.add_argument( - "--model_engine", - type=str, - default="", - help="model engine path", - ) - parser.add_argument( - "--nms_engine", - type=str, - default="", - help="nms engine path", - ) - parser.add_argument( - "--coco_gt", - type=str, - default="data/datasets/cv/coco2017/annotations/instances_val2017.json", - help="coco instances_val2017.json", - ) - parser.add_argument("--warm_up", type=int, default=3, help="warm_up count") - parser.add_argument("--loop_count", type=int, default=-1, help="loop count") - parser.add_argument( - "--eval_dir", - type=str, - default="data/datasets/cv/coco2017/val2017", - help="coco image dir", - ) - parser.add_argument("--bsz", type=int, default=32, help="test batch size") - parser.add_argument( - "--imgsz", - "--img", - "--img-size", - type=int, - default=640, - help="inference size h,w", - ) - parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--use_async", action="store_true") - parser.add_argument("--debug", action="store_true") - parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs") - parser.add_argument("--map_target", type=float, default=0.56, help="target mAP") - parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps") - parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly") - parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU") - - config = parser.parse_args() - print("config:", config) - return config - -if __name__ == "__main__": - config = parse_config() - main(config) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py deleted file mode 100644 index ae47dc8e854b6bea1f768e65c4dd481048bfebce..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
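For reference, the accuracy ("MAP") path of the `inference.py` removed above reduces to dumping detections as COCO-format JSON and scoring them with pycocotools; a condensed sketch of that check (file names are placeholders):

```python
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

anno = COCO("instances_val2017.json")  # ground-truth annotations
pred = anno.loadRes("preds.json")      # [{image_id, category_id, bbox, score}, ...]
coco_eval = COCOeval(anno, pred, "bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
map5095, map50 = coco_eval.stats[:2]   # the two numbers the script compared to its target
```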
- -import ctypes -import tensorrt -from os.path import join, dirname, exists -def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so") - if not exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") - ctypes.CDLL(dynamic_path) - tensorrt.init_libnvinfer_plugins(logger, namespace) - print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/quant.py b/models/cv/object_detection/yolov7/ixrt/quant.py deleted file mode 100644 index 36fd39a13c2e1e40f4dc0098f042e66e4bd0d26a..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/quant.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import os -import random -import argparse -import numpy as np -from tensorrt.deploy import static_quantize - -import torch -from calibration_dataset import create_dataloaders - -def setseed(seed=42): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str) - parser.add_argument("--model", type=str, default="yolov5s_with_decoder.onnx") - parser.add_argument("--data_process_type", type=str, default="none") - parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017") - parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json") - parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile") - parser.add_argument("--disable_quant_names", nargs='*', type=str) - parser.add_argument("--save_dir", type=str, help="save path", default=None) - parser.add_argument("--bsz", type=int, default=32) - parser.add_argument("--step", type=int, default=20) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--imgsz", type=int, default=640) - args = parser.parse_args() - return args - -args = parse_args() -setseed(args.seed) -model_name = args.model_name - -out_dir = args.save_dir -dataloader = create_dataloaders( - data_path=args.dataset_dir, - annFile=args.ann_file, - img_sz=args.imgsz, - batch_size=args.bsz, - step=args.step, - data_process_type=args.data_process_type -) -# print("disable_quant_names : ", args.disable_quant_names) -static_quantize(args.model, - calibration_dataloader=dataloader, - save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"), - observer=args.observer, - data_preprocess=lambda x: x[0].to("cuda"), - quant_format="qdq", - disable_quant_names=args.disable_quant_names) \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/requirements.txt 
b/models/cv/object_detection/yolov7/ixrt/requirements.txt deleted file mode 100644 index 10a9fba6a70545eee20ab0db7bb740b1d4807f95..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -tqdm -onnx -onnxsim -ultralytics==8.3.97 -pycocotools -opencv-python==4.6.0.66 -pycuda \ No newline at end of file diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh index 140ab8ace521610303cbfc0582e0c5eaf6188c62..52ec959f1ea4b6b192b111c5b837904183d17876 100644 --- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh +++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh index 01542134796eda4a0d46c33e3d28c23120e690ba..5e2f97fb43b1480fcce35669e7d2346f63eb64a8 100644 --- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh +++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh index 18d11eff42fc5ecf60ae3338ae1c4688ff252127..606fc94c8c7f11a517b7bf3fe1fcf22ccb6d68d1 100644 --- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh +++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh index 08525d287a6e850b7ee253bb13dc165c2253f045..b29836695881a184603d63159e0872c508d94486 100644 --- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh +++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh @@ -1,18 +1,4 @@ #!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. EXIT_STATUS=0 check_status() diff --git a/models/cv/object_detection/yolov7/ixrt/simplify_model.py b/models/cv/object_detection/yolov7/ixrt/simplify_model.py deleted file mode 100644 index 1400fd81ddb4b3fae1b20d0fd35082a692f5d292..0000000000000000000000000000000000000000 --- a/models/cv/object_detection/yolov7/ixrt/simplify_model.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
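The `simplify_model.py` deleted just below is a thin wrapper over onnx-simplifier; its working core is roughly the following (paths are placeholders; the `assert` on the checker result is an addition for safety, not in the original script):

```python
import onnx
from onnxsim import simplify

model = onnx.load("model.onnx")            # placeholder path
model_simp, ok = simplify(model)
assert ok, "onnx-simplifier check failed"  # original script did not check this
model_simp = onnx.shape_inference.infer_shapes(model_simp)
onnx.save(model_simp, "model_sim.onnx")    # placeholder path
```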
- -import onnx -import argparse -from onnxsim import simplify - -# Simplify -def simplify_model(args): - onnx_model = onnx.load(args.origin_model) - model_simp, check = simplify(onnx_model) - model_simp = onnx.shape_inference.infer_shapes(model_simp) - onnx.save(model_simp, args.output_model) - print(" Simplify onnx Done.") - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--origin_model", type=str) - parser.add_argument("--output_model", type=str) - args = parser.parse_args() - return args - -args = parse_args() -simplify_model(args) \ No newline at end of file diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh index ed40e8dcaf19993516b1c25f1d99b2b06c45f606..455a54831e2cfbfecd8e38bbad0f6a4900065262 100644 --- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh +++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh @@ -18,7 +18,7 @@ batchsize=${BATCH_SIZE:-"32"} model_path="yolox" datasets_path=${DATASETS_DIR} -DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0" +DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0" # cut onnx python3 python/cut_model.py \ diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh index c66562d69bb27800259f159fc9afc3f6ac63194b..913d97295cf68aec424c650564f86887279a3f02 100644 --- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh +++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh @@ -18,7 +18,7 @@ batchsize=${BATCH_SIZE:-"32"} model_path="yolox" datasets_path=${DATASETS_DIR} -DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0" +DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0" # cut onnx python3 python/cut_model.py \ diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh b/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh index b9140aa3524dfab2af3249dee4e6ab89948b3607..f256bc1ddb0984f905c46165c59a2244bea77425 100644 --- a/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh +++ b/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh @@ -24,5 +24,5 @@ else echo "Not Support Os" fi -pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl +pip3 install /mnt/deepspark/data/3rd_party/diffusers-0.31.0-py3-none-any.whl pip3 install -r requirements.txt \ No newline at end of 
file
diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md
index 5117a327f5830886f4a2e1e0589a7f8ac71cfff5..78d4117c170f1db13021db147a13bd7d87db0d5e 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/README.md
+++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md
@@ -22,8 +22,8 @@ significantly enhancing capabilities around OCR, document understanding and visu
 ```bash
 cp -r ../../vllm_public_assets/ ./
 
-# Download model from the website and make sure the model's path is "data/Aria"
-mkdir data
+# Download model from the website and make sure the model's path is "idefics3"
+mkdir idefics3
 ```
 
 ### Install Dependencies
@@ -36,13 +36,14 @@ In order to run the model smoothly, you need to get the sdk from [resource cente
 yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
+
+pip install transformers==4.50.3
 ```
 
 ## Model Inference
 
 ```bash
-export VLLM_ASSETS_CACHE=../vllm/
-python3 offline_inference_vision_language.py --model data/Idefics3-8B-Llama3 -tp 4 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+python3 offline_inference_vision_language.py --model-type idefics3
 ```
 
 ## Model Results
diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
index 7232aa2996f379a961cf931968a1319fb70ac091..26f7a3ffa914a58ae2cb1905e4140bf4779e8911 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
+++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
@@ -25,3 +25,4 @@ else
 fi
 
 cp -r ../../vllm_public_assets/ ./
+pip install transformers==4.50.3
\ No newline at end of file
diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
index 958131c646eb2e3f741257b4883ba0fcc04d8840..c25936037c9d50e5aecaeba7cf49c57463aeabc4 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
+++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
@@ -1,55 +1,67 @@
-#!/bin/bash
-# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference
-with the correct prompt format on vision language models.
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for text generation.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
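For orientation before the rewritten script below: the minimal shape of the vLLM multimodal call it builds up to looks like this; a sketch only, with a placeholder model path, assuming the Idefics3 weights are already downloaded locally:

```python
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
llm = LLM(model="./idefics3", max_model_len=8192, enforce_eager=True)  # placeholder path
inputs = {
    "prompt": "<|begin_of_text|>User:<image>What is shown?<end_of_utterance>\nAssistant:",
    "multi_modal_data": {"image": image},
}
outputs = llm.generate(inputs, SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```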
""" -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect -from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset +import random +from dataclasses import asdict +from typing import NamedTuple, Optional +from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. # Idefics3-8B-Llama3 -def run_idefics3(question: str, engine_params, modality: str): +def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + model_name = "./idefics3" - llm = LLM(**engine_params) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - stop_token_ids = None - return llm, prompt, stop_token_ids - + prompts = [( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) +model_example_map = { + "idefics3": run_idefics3, +} def get_multi_modal_input(args): """ @@ -60,92 +72,188 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, + "questions": img_questions, } if args.modality == "video": # Input video and question video = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" + vid_questions = ["Why is this video funny?"] return { "data": video, - "question": vid_question, + "questions": vid_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) - llm, prompt, stop_token_ids = run_idefics3(question,engine_params,args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] + + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) - outputs = llm.generate(inputs, sampling_params=sampling_params) + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 old mode 100755 new mode 100644 similarity index 100% rename from models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 rename to models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md index 
a404f6ec2cb73151184612fbfa89bee0d5ce26ca..ea1c8d748e3daa6330fb59767289c4b2bb6dcc95 100644 --- a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md @@ -16,13 +16,12 @@ techniques, making it suitable for deployment in resource-constrained environmen ### Prepare Resources -- Model: +- Model: ```bash cp -r ../../vllm_public_assets/ ./ -# Download model from the website and make sure the model's path is "data/Aria" -mkdir data +# Download model from the website and make sure the model's path is "./minicpm_v" ``` ### Install Dependencies @@ -42,8 +41,7 @@ pip install timm==0.9.10 ## Model Inference ```bash -export VLLM_ASSETS_CACHE=../vllm/ -PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model data/MiniCPM-V-2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 +python3 offline_inference_vision_language.py --model-type minicpmv ``` ## Model Results diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py index 2fc88f4695bf32cc400dbb51ea5dae4c3fb8b11a..f6df6f98d4f8f50be1ea9b703e023e32fa756e6f 100644 --- a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py @@ -1,42 +1,42 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect +import random +from dataclasses import asdict +from typing import NamedTuple, Optional + +from huggingface_hub import snapshot_download from transformers import AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. 
+# Unless specified, these settings have been tested to work on a single L4.
 
 # MiniCPM-V
-def run_minicpmv(question, engine_params, model,modality):
-    assert modality == "image"
+def run_minicpmv_base(questions: list[str], modality: str, model_name):
+    assert modality in ["image", "video"]
+    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
+
     # 2.0
     # The official repo doesn't work yet, so we need to use a fork for now
     # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
@@ -45,10 +45,25 @@ def run_minicpmv(question, engine_params, model,modality):
 
     # 2.5
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
-    #2.6
-    tokenizer = AutoTokenizer.from_pretrained(model,
+    # 2.6
+    # model_name = "openbmb/MiniCPM-V-2_6"
+    # o2.6
+
+    # modality supports
+    # 2.0: image
+    # 2.5: image
+    # 2.6: image, video
+    # o2.6: image, video, audio
+    # model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
-    llm = LLM(**engine_params)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
     # stop_token_ids = [tokenizer.eos_id]
@@ -56,18 +71,38 @@ def run_minicpmv(question, engine_params, model,modality):
 
     # 2.5
     # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
 
-    # 2.6
+    # 2.6 / o2.6
     stop_tokens = ['<|im_end|>', '<|endoftext|>']
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
-    messages = [{
-        'role': 'user',
-        'content': f'(<image>./</image>)\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
-    return llm, prompt, stop_token_ids
+    modality_placeholder = {
+        "image": "(<image>./</image>)",
+        "video": "(<video>./</video>)",
+    }
+
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{
+                'role': 'user',
+                'content': f"{modality_placeholder[modality]}\n{question}"
+            }],
+            tokenize=False,
+            add_generation_prompt=True) for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
+    return run_minicpmv_base(questions, modality, "./minicpm_v")
+
+
+model_example_map = {
+    "minicpmv": run_minicpmv,
+}
 
 
 def get_multi_modal_input(args):
@@ -79,92 +114,188 @@
     """
     if args.modality == "image":
         # Input image and question
-        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-        img_question = "What is the content of this image?"
+        image = ImageAsset("cherry_blossom") \
+            .pil_image.convert("RGB")
+        img_questions = [
+            "What is the content of this image?",
+            "Describe the content of this image in detail.",
+            "What's in the image?",
+            "Where is this image taken?",
+        ]
 
         return {
             "data": image,
-            "question": img_question,
+            "questions": img_questions,
        }
 
     if args.modality == "video":
         # Input video and question
         video = VideoAsset(name="sample_demo_1.mp4",
                            num_frames=args.num_frames).np_ndarrays
-        vid_question = "Why is this video funny?"
+        vid_questions = ["Why is this video funny?"]
 
         return {
             "data": video,
-            "question": vid_question,
+            "questions": vid_questions,
        }
 
     msg = f"Modality {args.modality} is not supported."
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] - llm, prompt, stop_token_ids = run_minicpmv(question,engine_params, args.model, args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + req_data = model_example_map[model](questions, modality) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] - outputs = llm.generate(inputs, sampling_params=sampling_params) + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) + + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt b/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt deleted file mode 100644 index 9a0e7a1217b72ee65ddca197e07b75294e736d60..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(nv_plugin) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake;${CMAKE_MODULE_PATH}") -set(CMAKE_CXX_EXTENSIONS OFF) - -set(TARGET_NAME ixrt_plugin) -set(SHARED_TARGET ${TARGET_NAME}) -set(STATIC_TARGET ${TARGET_NAME}_static) -set(PLUGIN_REPO_PATH ${PROJECT_SOURCE_DIR}) - 
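The `--time-generate` branch above writes the `llm.generate(...)` call twice just to wrap it with timestamps. A small helper keeps a single call site; this is a sketch, with `llm`, `inputs`, and `sampling_params` assumed from the surrounding script:

```python
import time
from typing import Callable, TypeVar

T = TypeVar("T")


def maybe_timed(fn: Callable[[], T], enabled: bool) -> T:
    """Run fn(), printing its wall-clock time when enabled."""
    if not enabled:
        return fn()
    start_time = time.time()
    result = fn()
    print("-- generate time = {}".format(time.time() - start_time))
    return result


# outputs = maybe_timed(
#     lambda: llm.generate(inputs, sampling_params=sampling_params),
#     args.time_generate)
```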
-if(DEFINED USE_TENSORRT) - find_package(CUDA) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_75) - - include_directories( - ${CUDA_PATH}/include) - - message(STATUS "Plugin lib use TRT 8.6.1") - set(TRT_INC_PATH /usr/include/x86_64-linux-gnu/) - set(TRT_LIB_PATH /usr/lib/x86_64-linux-gnu/ /usr/local/cuda/targets/x86_64-linux/lib) - set(TRT_LIBRARY nvinfer cublasLt) - - message(STATUS "cuda_libs = ${CUDA_LIBRARIES}") - message(STATUS "cudadevrt_libs = ${CUDA_cudadevrt_LIBRARY}") -else() - include(FindIxrt) - include(FindCompiler) - include(FindCuda) - set(TRT_LIBRARY cublasLt cudart ixrt) - include_directories(${IXRT_INCLUDE_DIR} - ${CUDA_PATH}/include) - add_definitions(-D__ILUVATAR__) - - string(APPEND CMAKE_CXX_FLAGS " -std=c++17") -endif() - -include(FindPluginFiles) - -################################## Compile Options ###################################### -cuda_add_library(${SHARED_TARGET} SHARED - ${PLUGIN_FILES} -) - -target_link_libraries(${SHARED_TARGET} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ${TRT_LIBRARY}) -target_link_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/lib64 ${TRT_LIB_PATH} ${IXRT_LIB_DIR}) -target_include_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/include ${TRT_INC_PATH} src PUBLIC src/common) diff --git a/models/nlp/plm/bert_large_squad/ixrt/README.md b/models/nlp/plm/bert_large_squad/ixrt/README.md index 13e741143479e58b50e79745511facd2542c4ea7..ce5255842c8649e525f1a9d4ca39ddaefaecee0b 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/README.md +++ b/models/nlp/plm/bert_large_squad/ixrt/README.md @@ -18,8 +18,7 @@ Get `bert-large-uncased.zip` from [Google Drive](https://drive.google.com/file/d/1eD8QBkbK6YN-_YXODp3tmpp3cZKlrPTA/view?usp=drive_link) ```bash -cd python/ -bash script/prepare.sh v1_1 +bash scripts/prepare.sh v1_1 ``` ### Install Dependencies @@ -27,27 +26,7 @@ bash script/prepare.sh v1_1 #### Install on Iluvatar ```bash -cmake -S . -B build -cmake --build build -j16 -``` - -#### Install on NV - -Require tensorrt_version >= 8.6 - -```bash -# Get TensorRT docker image -docker pull nvcr.io/nvidia/tensorrt:23.04-py3 -# Run TensorRT docker -``` - -```bash -# Install requirements.txt in TensorRT docker pip3 install -r requirements.txt - -# Build -cmake -S . -B build -DUSE_TENSORRT=true -cmake --build build -j16 ``` ## Model Inference @@ -55,20 +34,15 @@ cmake --build build -j16 ### FP16 ```bash -cd python/ - -# use --bs to set max_batch_size (dynamic) -bash script/build_engine.sh --bs 32 -bash script/inference_squad.sh --bs 32 +bash scripts/infer_bert_large_squad_fp16_accuracy.sh +bash scripts/infer_bert_large_squad_fp16_performance.sh ``` ### INT8 ```bash -cd python -pip install onnx pycuda -bash script/build_engine.sh --bs 32 --int8 -bash script/inference_squad.sh --bs 32 --int8 +bash scripts/infer_bert_large_squad_int8_accuracy.sh +bash scripts/infer_bert_large_squad_int8_performance.sh ``` | Model | BatchSize | Precision | Latency QPS | exact_match | f1 | diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py b/models/nlp/plm/bert_large_squad/ixrt/builder.py similarity index 38% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder.py rename to models/nlp/plm/bert_large_squad/ixrt/builder.py index 627027a09834314d25883d768b935b970a0fa64f..970f91bc27011be0ca26e1ac2a4f4cc255010ec8 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder.py @@ -1,3 +1,19 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + #!/usr/bin/env python3 # Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. @@ -30,38 +46,64 @@ # limitations under the License. # -import os import argparse -import json -import tensorrt as trt -import time -import sys import ctypes +import json import os -import numpy as np -from builder_utils import load_onnx_weights_and_quant, load_pytorch_weights_and_quant -from builder_utils import WQKV, BQKV # Attention Keys -from builder_utils import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys -from builder_utils import SQD_W, SQD_B # SQuAD Output Keys +import sys +import time -trt_version = [int(n) for n in trt.__version__.split('.')] -plugin_lib_name = "libnvinfer_plugin.so" if os.getenv('USE_TRT') == 'True' else "libixrt_plugin.so" +import numpy as np +import ixrt +from builder_utils import ( # Attention Keys; Transformer Keys; SQuAD Output Keys + B_AOUT, + B_LOUT, + B_MID, + BQKV, + SQD_B, + SQD_W, + W_AOUT, + W_LOUT, + W_MID, + WQKV, + load_onnx_weights_and_quant, + load_pytorch_weights_and_quant, +) + +plugin_lib_name = ( + "libnvinfer_plugin.so" if os.getenv("USE_TRT") == "True" else "libixrt_plugin.so" +) print(plugin_lib_name) -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) -from load_ixrt_plugin import load_ixrt_plugin, is_nvidia_platform +TRT_LOGGER = ixrt.Logger(ixrt.Logger.WARNING) +from load_ixrt_plugin import load_ixrt_plugin + load_ixrt_plugin(TRT_LOGGER) -plg_registry = trt.get_plugin_registry() +plg_registry = ixrt.get_plugin_registry() registry_list = plg_registry.plugin_creator_list -print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list]) -emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "1", "") -qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "1", "") -skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "1", "") -ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "") -gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "") +print( + "registry_list: ", + [registry.name + "/" + registry.plugin_version for registry in registry_list], +) +emln_plg_creator = plg_registry.get_plugin_creator( + "CustomEmbLayerNormPluginDynamic_IxRT", "1", "" +) +qkv2_plg_creator = plg_registry.get_plugin_creator( + "CustomQKVToContextPluginDynamic_IxRT", "1", "" +) +skln_plg_creator = plg_registry.get_plugin_creator( + "CustomSkipLayerNormPluginDynamic_IxRT", "1", "" +) +ffn_plg_creator = plg_registry.get_plugin_creator( + "CustomFFNPluginDynamic_IxRT", "1", "" +) +gelu_plg_creator = plg_registry.get_plugin_creator( + "CustomGeluPluginDynamic_IxRT", "1", "" +) fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "") + class BertConfig: def __init__(self, bert_config_path, use_fp16, use_trt): with open(bert_config_path, "r") as 
f: @@ -74,42 +116,51 @@ class BertConfig: self.use_fp16 = use_fp16 self.use_trt = use_trt + def set_tensor_name(tensor, prefix, name): tensor.name = prefix + name -def set_output_name(layer, prefix, name, out_idx = 0): + +def set_output_name(layer, prefix, name, out_idx=0): set_tensor_name(layer.get_output(out_idx), prefix, name) -def set_output_range(layer, maxval, out_idx = 0): + +def set_output_range(layer, maxval, out_idx=0): layer.get_output(out_idx).set_dynamic_range(-maxval, maxval) + def get_mha_dtype(config): - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 + dtype = ixrt.float16 return int(dtype) + def custom_fc(network, input_tensor, out_dims, W, B): - pf_out_dims = trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32) - pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32) - pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + pf_out_dims = ixrt.PluginField( + "out_dims", np.array(out_dims, dtype=np.int32), ixrt.PluginFieldType.INT32 + ) + pf_type = ixrt.PluginField( + "type_id", np.array(int(ixrt.float16), dtype=np.int32), ixrt.PluginFieldType.INT32 + ) + pf_W = ixrt.PluginField("W", W, ixrt.PluginFieldType.FLOAT32) fields = [pf_out_dims, pf_type, pf_W] if B is not None: - pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32) + pf_B = ixrt.PluginField("B", B, ixrt.PluginFieldType.FLOAT32) fields.append(pf_B) - pfc = trt.PluginFieldCollection(fields) + pfc = ixrt.PluginFieldCollection(fields) fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) plug_inputs = [input_tensor] out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) return out_dense + def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask): """ Add the attention layer """ - assert(len(input_tensor.shape) == 5) - B, S, hidden_size, _, _ = input_tensor.shape + B, S, hidden_size = input_tensor.shape num_heads = config.num_attention_heads head_size = int(hidden_size / num_heads) @@ -117,18 +168,27 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) Ball = init_dict[prefix + BQKV] # FC_attention - if config.use_trt: - mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball) - else: - mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball) + mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball) has_mask = imask is not None # QKV2CTX - pf_type = trt.PluginField("type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) - pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) - pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) - pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32) - pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask, pf_type]) + pf_type = ixrt.PluginField( + "type_id", + np.array([get_mha_dtype(config)], np.int32), + ixrt.PluginFieldType.INT32, + ) + pf_hidden_size = ixrt.PluginField( + "hidden_size", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_num_heads = ixrt.PluginField( + "num_heads", np.array([num_heads], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_has_mask = ixrt.PluginField( + "has_mask", np.array([has_mask], np.int32), ixrt.PluginFieldType.INT32 + ) + pfc = ixrt.PluginFieldCollection( + [pf_hidden_size, 
pf_num_heads, pf_has_mask, pf_type] + ) qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) qkv_in = [mult_all.get_output(0)] @@ -143,46 +203,56 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, bias=None): Add the skip layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 + dtype = ixrt.float16 - pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + pf_ld = ixrt.PluginField( + "ld", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) wbeta = init_dict[prefix + "beta"] - pf_beta = trt.PluginField("beta", wbeta, trt.PluginFieldType.FLOAT32) + pf_beta = ixrt.PluginField("beta", wbeta, ixrt.PluginFieldType.FLOAT32) wgamma = init_dict[prefix + "gamma"] - pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32) - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + pf_gamma = ixrt.PluginField("gamma", wgamma, ixrt.PluginFieldType.FLOAT32) + pf_type = ixrt.PluginField( + "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32 + ) - fields = [pf_ld, pf_beta, pf_gamma, pf_type ] + fields = [pf_ld, pf_beta, pf_gamma, pf_type] if bias is not None: - pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + pf_bias = ixrt.PluginField("bias", bias, ixrt.PluginFieldType.FLOAT32) fields.append(pf_bias) - pfc = trt.PluginFieldCollection(fields) + pfc = ixrt.PluginFieldCollection(fields) skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) skipln_inputs = [input_tensor, skip] layer = network.add_plugin_v2(skipln_inputs, skipln_plug) return layer + def ffn_trt(prefix, config, init_dict, network, input_tensor): - # FC1 + GELU + # FC1 + GELU B_mid = init_dict[prefix + B_MID] W_mid = init_dict[prefix + W_MID] - mid_dense = network.add_fully_connected(input_tensor, config.intermediate_size, W_mid, B_mid) + mid_dense = network.add_fully_connected( + input_tensor, config.intermediate_size, W_mid, B_mid + ) - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) - pf_ld = trt.PluginField("ld", np.array([config.hidden_size], np.int32), trt.PluginFieldType.INT32) - - pfc = trt.PluginFieldCollection([pf_type, pf_ld]) + dtype = ixrt.float16 + pf_type = ixrt.PluginField( + "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_ld = ixrt.PluginField( + "ld", np.array([config.hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) + + pfc = ixrt.PluginFieldCollection([pf_type, pf_ld]) gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc) gelu_inputs = [mid_dense.get_output(0)] @@ -194,54 +264,88 @@ def ffn_trt(prefix, config, init_dict, network, input_tensor): # Dense to hidden size B_lout = init_dict[prefix + B_LOUT] W_lout = init_dict[prefix + W_LOUT] - out_dense = network.add_fully_connected(intermediate_act, config.hidden_size, W_lout, B_lout) + out_dense = network.add_fully_connected( + intermediate_act, config.hidden_size, W_lout, B_lout + ) B_lout = None - out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, B_lout) + out_layer = skipln( + prefix + "output_layernorm_", + config, + init_dict, + network, + out_dense.get_output(0), + input_tensor, + B_lout, + ) return out_layer + def ffn(prefix, config, 
init_dict, network, input_tensor): # FC1 + GELU B_mid = init_dict[prefix + B_MID] W_mid = init_dict[prefix + W_MID] B_lout = init_dict[prefix + B_LOUT] W_lout = init_dict[prefix + W_LOUT] - pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32) - pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32) - pf_W1 = trt.PluginField("W1", W_mid, trt.PluginFieldType.FLOAT32) - pf_W2 = trt.PluginField("W2", W_lout, trt.PluginFieldType.FLOAT32) - pf_B1 = trt.PluginField("B1", B_mid, trt.PluginFieldType.FLOAT32) - pf_act_type = trt.PluginField("act_type", np.array(int(3), np.int32), trt.PluginFieldType.INT32) - pfc = trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]) + pf_out_dim = ixrt.PluginField( + "out_dims", np.array(config.hidden_size, np.int32), ixrt.PluginFieldType.INT32 + ) + pf_type = ixrt.PluginField( + "type_id", np.array(int(ixrt.float16), np.int32), ixrt.PluginFieldType.INT32 + ) + pf_W1 = ixrt.PluginField("W1", W_mid, ixrt.PluginFieldType.FLOAT32) + pf_W2 = ixrt.PluginField("W2", W_lout, ixrt.PluginFieldType.FLOAT32) + pf_B1 = ixrt.PluginField("B1", B_mid, ixrt.PluginFieldType.FLOAT32) + pf_act_type = ixrt.PluginField( + "act_type", np.array(int(3), np.int32), ixrt.PluginFieldType.INT32 + ) + pfc = ixrt.PluginFieldCollection( + [pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type] + ) ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc) ffn_inputs = [input_tensor] ffn_layer = network.add_plugin_v2(ffn_inputs, ffn_plug) - out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, ffn_layer.get_output(0), input_tensor, B_lout) + out_layer = skipln( + prefix + "output_layernorm_", + config, + init_dict, + network, + ffn_layer.get_output(0), + input_tensor, + B_lout, + ) return out_layer + def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask): """ Add the transformer layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] - context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) + context_transposed = attention_layer_opt( + prefix + "attention_", config, init_dict, network, input_tensor, imask + ) attention_heads = context_transposed.get_output(0) - + # FC0 B_aout = init_dict[prefix + B_AOUT] W_aout = init_dict[prefix + W_AOUT] - if config.use_trt: - attention_out_fc = network.add_fully_connected(attention_heads, hidden_size, W_aout, B_aout) - else: - attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout) - B_aout = None - - skiplayer = skipln(prefix + "attention_output_layernorm_",config, init_dict, network, attention_out_fc.get_output(0), input_tensor, B_aout) + attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout) + B_aout = None + + skiplayer = skipln( + prefix + "attention_output_layernorm_", + config, + init_dict, + network, + attention_out_fc.get_output(0), + input_tensor, + B_aout, + ) attention_ln = skiplayer.get_output(0) if config.use_trt: @@ -250,121 +354,277 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas ffn_layer = ffn(prefix, config, init_dict, network, attention_ln) return ffn_layer + def bert_model(config, init_dict, network, input_tensor, input_mask): """ Create the bert model """ prev_input = input_tensor for layer in range(0, config.num_hidden_layers): - ss = "l{}_".format(layer) - out_layer = 
transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask) + ss = "l{}_".format(layer) + out_layer = transformer_layer_opt( + ss, config, init_dict, network, prev_input, input_mask + ) prev_input = out_layer.get_output(0) return prev_input + def squad_output(prefix, config, init_dict, network, input_tensor): """ Create the squad output """ idims = input_tensor.shape - assert len(idims) == 5 - B, S, hidden_size, _, _ = idims + B, S, hidden_size = idims W_out = init_dict[prefix + SQD_W] B_out = init_dict[prefix + SQD_B] + dense = custom_fc(network, input_tensor, 2, W_out, B_out) + if config.use_trt: - dense = network.add_fully_connected(input_tensor, 2, W_out, B_out) - else: - dense = custom_fc(network, input_tensor, 2, W_out, B_out) - + OUT = network.add_shuffle(dense.get_output(0)) + OUT.second_transpose = (1, 0, 2) + return OUT return dense -def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): - input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + +def emb_layernorm( + builder, + network, + config, + weights_dict, + builder_config, + sequence_lengths, + batch_sizes, +): + input_ids = network.add_input( + name="input_ids", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) + segment_ids = network.add_input( + name="segment_ids", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) + input_mask = network.add_input( + name="input_mask", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) if len(sequence_lengths) > 1: profile = builder.create_optimization_profile() min_shape = (batch_sizes[0], sequence_lengths[0]) opt_shape = (batch_sizes[1], sequence_lengths[1]) max_shape = (batch_sizes[2], sequence_lengths[2]) - assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2]) - - print('set dynamic shape -> ', min_shape, opt_shape, max_shape) + assert ( + sequence_lengths[0] <= sequence_lengths[1] + and sequence_lengths[1] <= sequence_lengths[2] + ) + + print("set dynamic shape -> ", min_shape, opt_shape, max_shape) profile.set_shape("input_ids", min_shape, opt_shape, max_shape) profile.set_shape("segment_ids", min_shape, opt_shape, max_shape) profile.set_shape("input_mask", min_shape, opt_shape, max_shape) builder_config.add_optimization_profile(profile) - wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32) - wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32) - wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32) - wtokemb = 
trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) - wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) - - output_fp16 = trt.PluginField("output_fp16", np.array([1 if config.use_fp16 else 0]).astype(np.int32), trt.PluginFieldType.INT32) - mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) - - pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) + wbeta = ixrt.PluginField( + "bert_embeddings_layernorm_beta", + weights_dict["bert_embeddings_layernorm_beta"], + ixrt.PluginFieldType.FLOAT32, + ) + + wgamma = ixrt.PluginField( + "bert_embeddings_layernorm_gamma", + weights_dict["bert_embeddings_layernorm_gamma"], + ixrt.PluginFieldType.FLOAT32, + ) + wwordemb = ixrt.PluginField( + "bert_embeddings_word_embeddings", + weights_dict["bert_embeddings_word_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + wtokemb = ixrt.PluginField( + "bert_embeddings_token_type_embeddings", + weights_dict["bert_embeddings_token_type_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + wposemb = ixrt.PluginField( + "bert_embeddings_position_embeddings", + weights_dict["bert_embeddings_position_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + + output_fp16 = ixrt.PluginField( + "output_fp16", + np.array([1 if config.use_fp16 else 0]).astype(np.int32), + ixrt.PluginFieldType.INT32, + ) + mha_type = ixrt.PluginField( + "mha_type_id", + np.array([get_mha_dtype(config)], np.int32), + ixrt.PluginFieldType.INT32, + ) + + pfc = ixrt.PluginFieldCollection( + [wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type] + ) fn = emln_plg_creator.create_plugin("embeddings", pfc) - inputs = [input_ids, segment_ids, input_mask] + if config.use_trt: + input_ids = network.add_shuffle(input_ids) + input_ids.second_transpose = (1, 0) + segment_ids = network.add_shuffle(segment_ids) + segment_ids.second_transpose = (1, 0) + input_mask = network.add_shuffle(input_mask) + input_mask.second_transpose = (1, 0) + inputs = [ + input_ids.get_output(0), + segment_ids.get_output(0), + input_mask.get_output(0), + ] + else: + inputs = [input_ids, segment_ids, input_mask] emb_layer = network.add_plugin_v2(inputs, fn) return emb_layer + def build_engine(batch_sizes, sequence_lengths, config, weights_dict): - explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + explicit_batch_flag = 1 << int(ixrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - builder = trt.Builder(TRT_LOGGER) - with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: + builder = ixrt.Builder(TRT_LOGGER) + with builder.create_network( + explicit_batch_flag + ) as network, builder.create_builder_config() as builder_config: if config.use_fp16: - builder_config.set_flag(trt.BuilderFlag.FP16) + builder_config.set_flag(ixrt.BuilderFlag.FP16) # Create the network - emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes) + emb_layer = emb_layernorm( + builder, + network, + config, + weights_dict, + builder_config, + sequence_lengths, + batch_sizes, + ) embeddings = emb_layer.get_output(0) mask_idx = emb_layer.get_output(1) - + bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx) squad_logits = squad_output("cls_", config, 
weights_dict, network, bert_out) squad_logits_out = squad_logits.get_output(0) + squad_logits.set_output_type(0, ixrt.float32) network.mark_output(squad_logits_out) build_start_time = time.time() - engine = builder.build_engine(network, builder_config) - build_time_elapsed = (time.time() - build_start_time) - TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) - return engine - + serialized_engine = builder.build_serialized_network(network, builder_config) + build_time_elapsed = time.time() - build_start_time + TRT_LOGGER.log( + TRT_LOGGER.INFO, "build serialized_engine in {:.3f} Sec".format(build_time_elapsed) + ) + return serialized_engine + + def str2bool(v): - return v.lower() in ('yes', 'true') + return v.lower() in ("yes", "true") + def main(): - parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-z", "--use_trt", type=str2bool, default=False, help = "Whether to use tensorRT or IxRT") - parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.") - parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.") - parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine") - parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int) - parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int) - parser.add_argument("-c", "--config-dir", required=True, - help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google") - parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False) - parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False) - parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False) - parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False) + parser = argparse.ArgumentParser( + description="IxRT BERT Sample", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-z", + "--use_trt", + type=str2bool, + default=False, + help="Whether to use ixrt or IxRT", + ) + parser.add_argument( + "-x", "--onnx", required=False, help="The ONNX model file path." + ) + parser.add_argument( + "-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path." + ) + parser.add_argument( + "-o", + "--output", + required=True, + default="bert_base_384.engine", + help="The bert engine file, ex bert.engine", + ) + parser.add_argument( + "-b", + "--batch-size", + nargs="+", + help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. 
Can be specified multiple times to optimize for more than one batch size.", + type=int, + ) + parser.add_argument( + "-s", + "--sequence-length", + nargs="+", + help="Sequence length of the BERT model", + type=int, + ) + parser.add_argument( + "-c", + "--config-dir", + required=True, + help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google", + ) + parser.add_argument( + "-f", + "--fp16", + action="store_true", + help="Indicates that inference should be run in FP16 precision", + required=False, + ) + parser.add_argument( + "-j", + "--squad-json", + default="squad/dev-v1.1.json", + help="squad json dataset used for int8 calibration", + required=False, + ) + parser.add_argument( + "-v", + "--vocab-file", + default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", + help="Path to file containing entire understandable vocab", + required=False, + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Turn on verbose logger and set profiling verbosity to DETAILED", + required=False, + ) args, _ = parser.parse_known_args() args.batch_size = args.batch_size or [1] args.sequence_length = args.sequence_length or [128] - args.use_trt = is_nvidia_platform() if len(args.sequence_length) not in [1, 3]: - print("Error: You must provide either one or three integers.") + print( + "Error: You must provide either one or three integers." + ) sys.exit(1) if len(args.batch_size) not in [1, 3]: @@ -375,7 +635,9 @@ def main(): TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE bert_config_path = args.config_dir - TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) + TRT_LOGGER.log( + TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path) + ) config = BertConfig(bert_config_path, args.fp16, args.use_trt) @@ -384,15 +646,18 @@ def main(): elif args.pytorch != None: weights_dict = load_pytorch_weights_and_quant(args.pytorch, config) else: - raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") + raise RuntimeError( + "You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model." 
+ ) - with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine: - TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...") - serialized_engine = engine.serialize() + with build_engine( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) with open(args.output, "wb") as fout: fout.write(serialized_engine) TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py similarity index 89% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_int8.py index e51d7c40d5fd0a9d79514b0367b446058ddec14f..5a16bc5333247d01bb9ef31591b84c37e8c081d2 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py @@ -1,35 +1,3 @@ -#!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
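The save path above now gets a serialized plan directly from `build_serialized_network`, so there is no separate `engine.serialize()` step. For completeness, the matching load side would look roughly like this (a sketch, assuming the `ixrt` Python API mirrors TensorRT's `Runtime` here):

```python
import ixrt

TRT_LOGGER = ixrt.Logger(ixrt.Logger.WARNING)


def load_engine(path: str):
    """Deserialize a plan produced by build_serialized_network()."""
    with open(path, "rb") as f, ixrt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
```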
-# - import os import argparse import json @@ -43,6 +11,7 @@ from builder_utils_int8 import load_pytorch_weights_and_quant from builder_utils_int8 import WQKV, BQKV # Attention Keys from builder_utils_int8 import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys from builder_utils_int8 import SQD_W, SQD_B # SQuAD Output Keys +from builder import custom_fc as custom_fc_fp16 trt_version = [int(n) for n in trt.__version__.split('.')] @@ -114,8 +83,7 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) """ Add the attention layer """ - assert(len(input_tensor.shape) == 5) - B, S, hidden_size, _, _ = input_tensor.shape + B, S, hidden_size = input_tensor.shape num_heads = config.num_attention_heads head_size = int(hidden_size / num_heads) @@ -157,7 +125,6 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, residual, is_ Add the skip layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] dtype = trt.float32 @@ -236,7 +203,6 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas Add the transformer layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) @@ -281,13 +247,12 @@ def squad_output(prefix, config, init_dict, network, input_tensor): """ idims = input_tensor.shape - assert len(idims) == 5 - B, S, hidden_size, _, _ = idims + B, S, hidden_size = idims W_out = init_dict[prefix + SQD_W] B_out = init_dict[prefix + SQD_B] - dense = network.add_fully_connected(input_tensor, 2, W_out, B_out) + dense = custom_fc_fp16(network, input_tensor, 2, W_out, B_out) return dense def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): @@ -314,7 +279,7 @@ def emb_layernorm(builder, network, config, weights_dict, builder_config, sequen wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) - output_fp16 = trt.PluginField("output_fp16", np.array([0]).astype(np.int32), trt.PluginFieldType.INT32) + output_fp16 = trt.PluginField("output_fp16", np.array([1]).astype(np.int32), trt.PluginFieldType.INT32) mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) @@ -354,10 +319,10 @@ def build_engine(batch_sizes, sequence_lengths, config, weights_dict): network.mark_output(squad_logits_out) build_start_time = time.time() - engine = builder.build_engine(network, builder_config) + plan = builder.build_serialized_network(network, builder_config) build_time_elapsed = (time.time() - build_start_time) TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) - return engine + return plan def main(): parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -389,7 +354,7 @@ def main(): if args.verbose: TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE - bert_config_path = args.config_dir + bert_config_path = os.path.join(args.config_dir, "bert_config.json") TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: 
{:}".format(bert_config_path)) config = BertConfig(bert_config_path, args.int8) @@ -403,13 +368,11 @@ def main(): raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") # engine = build_engine(args.batch_size, args.workspace_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, None, args.calib_num, args.verbose) - with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine: - TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...") - serialized_engine = engine.serialize() + with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine: TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) with open(args.output, "wb") as fout: fout.write(serialized_engine) TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py similarity index 74% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils.py index 767379778633cafe889a4df414d8cc487495559b..51c294e104a8ee12c336b23d7455f44f8b66b9a0 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py @@ -1,34 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
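`builder_int8.py` now joins `bert_config.json` onto `--config-dir` itself instead of expecting a full file path. The fields the builders read from that file are the standard Google BERT config keys; a minimal sketch of what `BertConfig` pulls out (the real class also stores the precision flags shown earlier):

```python
import json


class BertConfigSketch:
    """Reads the subset of bert_config.json used by the engine builders."""

    def __init__(self, bert_config_path: str):
        with open(bert_config_path, "r") as f:
            data = json.load(f)
        self.num_attention_heads = data["num_attention_heads"]
        self.hidden_size = data["hidden_size"]
        self.intermediate_size = data["intermediate_size"]
        self.num_hidden_layers = data["num_hidden_layers"]
```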
-# - import onnx import numpy as np import tensorrt as trt @@ -93,6 +62,10 @@ def get_onnx_weight_dict(tensor_dict, config): Bqkv[1,:] = tensor_dict[prefix + BK] Bqkv[2,:] = tensor_dict[prefix + BV] + if config.use_trt: + Wqkv = np.ascontiguousarray(Wqkv.reshape((3, N, H, N, H)).transpose((1,0,2,3,4))) + Bqkv = np.ascontiguousarray(Bqkv.reshape((3, N, H)).transpose((1,0,2))) + weights_dict[prefix + WQKV] = Wqkv.flatten() weights_dict[prefix + BQKV] = Bqkv.flatten() weights_dict[prefix + WQKV + "_notrans"] = np.ascontiguousarray(Wqkv.T).flatten() @@ -103,6 +76,10 @@ def get_onnx_weight_dict(tensor_dict, config): flat_tensor = np.ascontiguousarray(tensor).flatten() weights_dict[outname] = flat_tensor + if outname.find("kernel") != -1 and config.use_trt: + tensor = np.transpose(tensor) + weights_dict[outname + "_notrans"] = np.ascontiguousarray(tensor).flatten() + return weights_dict def onnx_to_trt_name(onnx_name): @@ -162,24 +139,67 @@ def onnx_to_trt_name(onnx_name): parsed = '_'.join(toks) return parsed +def pt_to_trt_name(pt_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + pt_name = pt_name.lower() + toks = [t.strip('_') for t in pt_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + toks = toks[:-1] + + elif 'qa_outputs' in pt_name: ## + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", pt_name) + assert(False) + parsed = '_'.join(toks) + return parsed + def load_onnx_weights_and_quant(path, config): """ Load the weights from the onnx checkpoint """ model = onnx.load(path) weights = model.graph.initializer + # for w in weights: + # print(w.name, w.dims,flush=True) tensor_dict = dict((onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.int8).reshape(w.dims)) if w.name.split('_')[-1] == 'mask' else (onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.float32).reshape(w.dims)) for w in weights) + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) + return get_onnx_weight_dict(tensor_dict, config) def load_pytorch_weights_and_quant(path, config): """ Load the weights from the pytorch checkpoint """ - state_dict = torch.load(path, map_location='cpu')["model"] - tensor_dict = {onnx_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + state_dict = torch.load(path, map_location='cpu') + # for name in state_dict: + # print(name, state_dict[name].size(),flush=True) + tensor_dict = {pt_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) return get_onnx_weight_dict(tensor_dict, config) class BertConfig: diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py 
b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py similarity index 85% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py index 56ac8d1889912cb98817d5960767d94522441030..77997b1bb92f07a10d4de8de11b56620695f76a6 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py @@ -1,34 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - import numpy as np import tensorrt as trt import json diff --git a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh index ebc8effc48246556bd7fa5edadd0ad9d35a984a0..bc47b14905d175849a61a6e91921748705b85e2c 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh +++ b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh @@ -28,14 +28,6 @@ fi # install ixrt run bash /root/data/install/ixrt-1.0.0.alpha+corex.4.3.0-linux_x86_64.run -if [ "$1" = "nvidia" ]; then - cmake -S . -B build -DUSE_TENSORRT=true - cmake --build build -j16 -else - cmake -S . 
-B build - cmake --build build -j16 -fi - pip install -r requirements.txt -mkdir -p ./python/data -ln -s /root/data/checkpoints/bert-large-uncased/ ./python/data && ln -s /root/data/datasets/squad/ ./python/data \ No newline at end of file +mkdir -p ./data +ln -s /root/data/checkpoints/bert-large-uncased/ ./data && ln -s /root/data/datasets/squad/ ./data \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake deleted file mode 100644 index 07c436f5e545933e1debe34a0de482512f0ffb0a..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake +++ /dev/null @@ -1,15 +0,0 @@ -if(NOT COMPILER_PATH) - if (EXISTS /opt/sw_home/local/bin/clang++) - set(COMPILER_PATH /opt/sw_home/local/bin) - elseif (EXISTS /usr/local/corex/bin/clang++) - set(COMPILER_PATH /usr/local/corex/bin) - else() - message(STATUS "COMPILER_PATH is not set and we couldn't find clang compiler neither, will use system C/C++ compiler") - endif() -endif() -if (COMPILER_PATH) - set(CMAKE_CXX_COMPILER ${COMPILER_PATH}/clang++) - set(CMAKE_C_COMPILER ${COMPILER_PATH}/clang) -endif() - -message(STATUS "Use ${CMAKE_CXX_COMPILER} and ${CMAKE_C_COMPILER} as C++ and C compiler") \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake deleted file mode 100644 index 58e39e6003cb6a0545a76f9a6fab88e44fe39caa..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake +++ /dev/null @@ -1,57 +0,0 @@ -# This cmake does: -# - Set CUDA_PATH -# - Find libcudart -# - Util functions like cuda_add_library, cuda_add_executable - - -# CUDA_PATH can be specified through below means shown in priority order 1. -# cmake command line argument, -DCUDA_PATH=/path/to/cuda 2. 
bash environment -# variable, export CUDA_PATH=/path/to/cuda -if(DEFINED ENV{CUDA_PATH}) - set(CUDA_PATH "$ENV{CUDA_PATH}") -else() - set(CUDA_PATH - "/opt/sw_home/local/cuda" - CACHE PATH "cuda installation root path") -endif() -message(STATUS "Use CUDA_PATH=${CUDA_PATH} ") - -# GPU arch -if(NOT "${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - ${CUDA_ARCH} - CACHE STRING "GPU architecture tag, ivcore11") -else("${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - "ivcore11" - CACHE STRING "GPU architecture tag, ivcore11") -endif() -message(STATUS "Use CUDA_ARCH=${CUDA_ARCH}") - -macro(cuda_add_executable) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_executable(${ARGV}) -endmacro() - -macro(cuda_add_library) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_library(${ARGV}) -endmacro() - -find_library( - CUDART_LIBRARY cudart - PATHS ${CUDA_PATH} - PATH_SUFFIXES lib/x64 lib64 lib - NO_DEFAULT_PATH) - -if (NOT USE_TRT) - set(CUDA_LIBRARIES cudart) -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake deleted file mode 100644 index 5b0f27293edaebf80cd5bfd622c363f49b36966b..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# This cmake file decides how to build with IxRT -# Custom IxRT Path -if(NOT "${IXRT_HOME}" STREQUAL "") - set(IXRT_INCLUDE_DIR ${IXRT_HOME}/include) - set(IXRT_LIB_DIR ${IXRT_HOME}/lib) -# From default paths -else() - set(IXRT_INCLUDE_DIR /usr/local/corex/include) - set(IXRT_LIB_DIR /usr/local/corex/lib) -endif() - -message(STATUS "IXRT_INCLUDE_DIR: ${IXRT_INCLUDE_DIR}") -message(STATUS "IXRT_LIB_DIR: ${IXRT_LIB_DIR}") - -if(EXISTS ${IXRT_INCLUDE_DIR} AND EXISTS ${IXRT_LIB_DIR}) - include_directories(${IXRT_INCLUDE_DIR}) -else() - message( FATAL_ERROR "IxRT library doesn't exist!") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake deleted file mode 100644 index 603606996e8a310579fd86de3ea36125f19bbea1..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB_RECURSE PLUGIN_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu) - -if(DEFINED USE_TENSORRT) - list(FILTER PLUGIN_FILES EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/src/backend/ixinfer") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py similarity index 83% rename from models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py rename to models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py index ce5bb98df7f60176ac5def72f4c2a5d1d54f990e..c73db423d3b2e357de0527271f1d2d945c6bff76 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py +++ b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py @@ -1,18 +1,4 @@ #!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -107,10 +93,6 @@ def evaluate(dataset, predictions, f1_acc): print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.") else: print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["exact_match"] = round(exact_match, 3) - metricResult["metricResult"]["f1"] = round(f1, 3) - print(metricResult) return {'exact_match': exact_match, 'f1': f1} if __name__ == '__main__': diff --git a/models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py similarity index 89% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py index beacc625fae0f73bda3480054e4ecceca85fb240..73084f39b8de03d8cfcdfb37d31407d30d9c3176 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py +++ b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py @@ -19,8 +19,8 @@ import tensorrt as trt import os -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import numpy as np import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -80,9 +80,12 @@ class BertCalibrator(trt.IInt8LegacyCalibrator): segment_ids = features[0].segment_ids input_mask = features[0].input_mask - cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel()) + err, = cuda.cuMemcpyHtoD(self.device_inputs[0], input_ids.ravel(), input_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[1], segment_ids.ravel(), segment_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[2], input_mask.ravel(), input_mask.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) self.current_index += self.batch_size return self.device_inputs diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py similarity index 98% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py index 712e1a61d29a198eb276f41a9249b0c66e3786ba..88459ebfafbd84c11356c0a3dfc3838882e4b2f8 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py +++ 
b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py @@ -159,14 +159,14 @@ def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_le input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. - # while len(input_ids) < max_seq_length: - # input_ids.append(0) - # input_mask.append(0) - # segment_ids.append(0) - - # assert len(input_ids) == max_seq_length - # assert len(input_mask) == max_seq_length - # assert len(segment_ids) == max_seq_length + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length def create_int_feature(values): feature = np.asarray(values, dtype=np.int32, order=None) diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py b/models/nlp/plm/bert_large_squad/ixrt/inference.py similarity index 79% rename from models/nlp/plm/bert_large_squad/ixrt/python/inference.py rename to models/nlp/plm/bert_large_squad/ixrt/inference.py index ec93972d295cc3fa777ab60cf82d12401b99f7c3..95b88dc5153e4a8cf3f157ab2d6d980316747ed3 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py +++ b/models/nlp/plm/bert_large_squad/ixrt/inference.py @@ -38,8 +38,8 @@ import argparse import collections import numpy as np import tensorrt as trt -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -153,14 +153,15 @@ if __name__ == '__main__': break if selected_profile == -1: raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size)) - + # Create a stream in which to copy inputs/outputs and run inference. - stream = cuda.Stream() + err_dr, stream = cuda.cuStreamCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # if args.use_trt: # context.active_optimization_profile = selected_profile # else: - context.set_optimization_profile_async(selected_profile, stream.handle) + context.set_optimization_profile_async(selected_profile, stream) binding_idx_offset = selected_profile * num_binding_per_profile input_shape = (args.batch_size, max_seq_length) @@ -170,11 +171,17 @@ if __name__ == '__main__': assert context.all_binding_shapes_specified # Allocate device memory for inputs. - d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] + d_inputs = [] + for binding in range(3): + err, ptr = cuda.cuMemAlloc(input_nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + d_inputs.append(ptr) # Allocate output buffer by querying the size from the context. This may be different for different input shapes. 
-        h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32)
-        d_output = cuda.mem_alloc(h_output.nbytes)
+        h_output = np.empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32)
+
+        err, d_output = cuda.cuMemAlloc(h_output.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
     def inference(features, tokens):
         global h_output
@@ -191,25 +198,32 @@ if __name__ == '__main__':
            segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0)
            input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0)
 
-            input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
-            segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
-            input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))
+            # cuMemHostRegister returns only a status tuple, so bind the host
+            # arrays first and register them in place (flags=0 for defaults)
+            input_ids = np.ascontiguousarray(input_ids_batch.ravel())
+            err, = cuda.cuMemHostRegister(input_ids, input_ids.nbytes, 0)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            segment_ids = np.ascontiguousarray(segment_ids_batch.ravel())
+            err, = cuda.cuMemHostRegister(segment_ids, segment_ids.nbytes, 0)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            input_mask = np.ascontiguousarray(input_mask_batch.ravel())
+            err, = cuda.cuMemHostRegister(input_mask, input_mask.nbytes, 0)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
            eval_start_time = time.time()
-            cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
-            cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
-            cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids, input_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids, segment_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[2], input_mask, input_mask.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
            # Run inference
-            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
+            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
            # Synchronize the stream
-            stream.synchronize()
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
            eval_time_elapsed += (time.time() - eval_start_time)
 
            # Transfer predictions back from GPU
-            cuda.memcpy_dtoh_async(h_output, d_output, stream)
-            stream.synchronize()
-
+            err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+
            # Only retrieve and post-process the first batch
            batch = h_output[0]
 
@@ -218,7 +232,7 @@ if __name__ == '__main__':
                end_logits = np.array(batch.squeeze()[:, 1]),
                feature_index = feature_index
            ))
-
+
        eval_time_elapsed /= len(features)
 
        # Total number of n-best predictions to generate in the nbest_predictions.json output file
@@ -258,14 +272,16 @@ if __name__ == '__main__':
            batch_example = []
            max_batch_length = 0
            seq_length_list = []
-            for index in tqdm(sort_index):
+            for index in sort_index:
                batch_feature.append(features_list[index])
                batch_example.append(squad_examples[index])
                max_batch_length = max(max_batch_length, len(features_list[index].input_ids))
                if args.int8:
-                    max_batch_length = max_seq_length
-                else:
                    max_batch_length = math.ceil(max_batch_length / 2) * 2
+                else:
+                    # workaround: batch size 1 otherwise runs about 10% slower
+                    if args.batch_size == 1:
+                        max_batch_length = math.ceil(max_batch_length / 64) * 64
                seq_length_list.append(len(features_list[index].input_ids))
            if len(batch_feature) == args.batch_size:
                batch_input_ids = [
@@ -319,28 +335,39 @@ if __name__ == '__main__':
        for binding in range(3):
            context.set_binding_shape(binding, (args.batch_size, max_seq_length))
        assert context.all_binding_shapes_specified
-        cuda.memcpy_htod_async(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-        cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-        context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
-        stream.synchronize()
+        # warm-up pass with zero-filled inputs before timing
+        warmup_ids = np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel()
+        err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], warmup_ids, warmup_ids.nbytes, stream)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], warmup_ids, warmup_ids.nbytes, stream)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
+        err, = cuda.cuStreamSynchronize(stream)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-        start_time = time.time()
+        infer_total_time = 0
        output_index = 0
        for input_ids, segment_ids in tqdm(all_token_ids):
            for binding in range(3):
                context.set_binding_shape(binding, input_ids.shape)
            assert context.all_binding_shapes_specified
-            cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
-            cuda.memcpy_htod_async(d_inputs[1], segment_ids.ravel(), stream)
-            stream.synchronize()
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids.ravel(), input_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids.ravel(), segment_ids.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            # time only the engine execution, not the host/device copies
+            infer_start_time = time.time()
+            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            infer_end_time = time.time()
+            infer_time = infer_end_time - infer_start_time
+            infer_total_time += infer_time
+            err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
-            stream.synchronize()
-
-            cuda.memcpy_dtoh_async(h_output, d_output, stream)
-            stream.synchronize()
-
            new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2)
            for index in range(input_ids.shape[0]):
                networkOutputs.append(_NetworkOutput(
@@ -349,7 +376,12 @@ if __name__ == '__main__':
                    feature_index = index
                ))
                output_index += 1
-        infer_time = time.time() - start_time
+        # free the input device buffers once the evaluation loop is done
+        for i in range(3):
+            err, = cuda.cuMemFree(d_inputs[i])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+        err, = cuda.cuMemFree(d_output)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+
        output_index = 0
        for (be, bf) in zip(batch_example_list, batch_feature_list):
            for index in range(len(bf)):
@@ -357,7 +389,7 @@ if __name__ == '__main__':
                    [networkOutputs[output_index]], args.n_best_size, args.max_answer_length)
                output_index += 1
                all_precision[be[index].id] = prediction
-        return infer_time, all_precision
+        return infer_total_time, all_precision
 
    status = 0
    if squad_examples:
@@ -366,21 +398,16 @@ if __name__ == '__main__':
 
        features_list = []
        lengths = []
-        for example_index, example in tqdm(enumerate(squad_examples)):
+        for example_index, example in enumerate(squad_examples):
            features = question_features(example.doc_tokens, example.question_text)
            features_list.append(features[0])
            lengths.append(len(features[0].input_ids))
 
        sort_index = np.argsort(lengths)
-        infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions)
-        print(F"E2E time : {infer_time:.3f} seconds")
+        infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions)
 
-        qps = len(squad_examples)/infer_time
+        qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time
        print(f"Latency QPS: {qps} sentences/s")
-        metricResult = {"metricResult": {}}
-        metricResult["metricResult"]["E2E time"] = round(infer_time, 3)
-        metricResult["metricResult"]["Latency QPS"] = round(qps, 3)
-        print(metricResult)
 
        with open(output_prediction_file, "w") as f:
            f.write(json.dumps(all_predictions, indent=4))
@@ -415,4 +442,4 @@ if __name__ == '__main__':
 #            question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))
    del context
    del engine
-    sys.exit(status)
\ No newline at end of file
+    sys.exit(status)
diff --git a/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..b370130872bdf2a94cdb87b42909a6b1ce889b58
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,13 @@
+from os.path import join, dirname, exists
+import tensorrt as trt
+import ctypes
+
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    trt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py b/models/nlp/plm/bert_large_squad/ixrt/perf.py
similarity index 81%
rename from models/nlp/plm/bert_large_squad/ixrt/python/perf.py
rename to models/nlp/plm/bert_large_squad/ixrt/perf.py
index 968a39435bd597639e427ad2ac745579250cda0f..9f98fe39edf62aa455dcb6ecf813d0672e13904a 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/perf.py
@@ -1,34 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License.
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# -# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - import argparse import ctypes import time @@ -55,6 +24,7 @@ class DeviceBuffer(object): def main(): parser = argparse.ArgumentParser(description='BERT Inference Benchmark') + parser.add_argument("-z", "--use_trt", action="store_false", help="Whether to use tensorRT or IxRT") parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine') parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int) parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int) @@ -66,7 +36,7 @@ def main(): args.batch_size = args.batch_size or [1] # Import necessary plugins for BERT TensorRT - load_ixrt_plugin(TRT_LOGGER, dynamic_path="../build/libixrt_plugin.so") + load_ixrt_plugin(TRT_LOGGER) with open(args.engine, 'rb') as f: runtime = trt.Runtime(TRT_LOGGER) diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py deleted file mode 100644 index 93301c303658d92832d68c78b61757610d6ab201..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
-from os.path import join, dirname, exists, abspath
-import tensorrt as trt
-import ctypes
-import os
-import subprocess
-
-def is_nvidia_platform():
-    try:
-        # try to run nvidia-smi
-        subprocess.check_output(['nvidia-smi'])
-        return True
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        return False
-
-def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(abspath(__file__)), "..", "build", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    handle = ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
-    handle.initLibNvInferPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-    handle.initLibNvInferPlugins.restype = ctypes.c_bool
-    handle.initLibNvInferPlugins(None, namespace.encode('utf-8'))
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh
deleted file mode 100644
index 7a7a05c5dbca037d8f31ef6c9b707a800902df2f..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    python3 builder.py -x ./data/bert-large-uncased/bert_large_v1_1_fake_quant.onnx \
-                       -w 4096 \
-                       -o ./data/bert_large_384.engine \
-                       -s 1 384 384 \
-                       -b 1 ${BSZ} ${BSZ} \
-                       --fp16 \
-                       -c ./data/bert-large-uncased/bert_config.json
-else
-    echo 'USE_INT8=True'
-    python3 builder_int8.py -pt ./data/bert-large-uncased/bert_large_int8_qat.bin \
-                       -o ./data/bert_large_384_int8.engine \
-                       -s 1 384 384 \
-                       -b 1 ${BSZ} ${BSZ} \
-                       -i \
-                       -c ./data/bert-large-uncased/bert_config.json
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh
deleted file mode 100644
index 550c735e85b3b80202041a4ac878e73bcfeeaa14..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-PASSAGE='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders,
-speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations
-for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components
-to take advantage of powerful TensorRT optimizations for your apps.'
-QUESTION="What is TensorRT?"
- -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 inference.py -e ./data/bert_large_384.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert-large-uncased/vocab.txt -else - echo 'USE_INT8=True' - python3 inference.py -e ./data/bert_large_384_int8.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert-large-uncased/vocab.txt -fi - diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh deleted file mode 100644 index 088b1d39ed738804ab4959b2cfcb4948a02f4c33..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh +++ /dev/null @@ -1,36 +0,0 @@ -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384.engine \ - -b ${BSZ} \ - -s 384 \ - -sq ./data/squad/dev-v1.1.json \ - -v ./data/bert-large-uncased/vocab.txt \ - -o ./data/predictions-bert_large_384.json - python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_large_384.json 90 -else - echo 'USE_INT8=True' - UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384_int8.engine \ - -b ${BSZ} \ - -s 384 \ - -sq ./data/squad/dev-v1.1.json \ - -v ./data/bert-large-uncased/vocab.txt \ - -o ./data/predictions-bert_large_384_int8.json \ - -i - python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_large_384_int8.json 88 -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh deleted file mode 100644 index f19c1def4b139edc1e02b7ae595327dc367c919e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-
-index=0
-options=("$@") # store all arguments in an array
-PRECISION=fp16
-BSZ=32
-
-# iterate over all arguments
-while [[ $index -lt ${#options[@]} ]]; do
-    argument=${options[$index]}
-    case $argument in
-        --bs)
-            ((index++))
-            BSZ=${options[$index]}
-            ;;
-        --prec)
-            ((index++))
-            PRECISION=${options[$index]}
-            ;;
-    esac
-    ((index++))
-done
-
-# set INT8_FLAG
-INT8_FLAG=""
-if [[ "$PRECISION" == "int8" ]]; then
-    INT8_FLAG="--int8"
-fi
-
-echo "PREC_FLAG=$INT8_FLAG"
-echo "PRECISION=$PRECISION"
-echo "BSZ=$BSZ"
-
-# check the environment and run the corresponding build
-if command -v ixsmi &>/dev/null; then
-    echo "MR env"
-    cmake -S . -B build
-    cmake --build build -j16
-elif command -v nvidia-smi &>/dev/null; then
-    echo "NV env"
-    cmake -S . -B build -DUSE_TENSORRT=true
-    cmake --build build -j16
-else
-    echo "No driver detected"
-    exit 1
-fi
-cd ./python/
-bash script/build_engine.sh --bs $BSZ $INT8_FLAG
-bash script/inference_squad.sh --bs $BSZ $INT8_FLAG
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh
deleted file mode 100644
index 1ad462a763ccf37c19a0914b6a9b684bae52232c..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    python3 perf.py -e ./data/bert_large_384.engine -b ${BSZ} -s 384
-else
-    echo 'USE_INT8=True'
-    python3 perf.py -e ./data/bert_large_384_int8.engine -b ${BSZ} -s 384
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..02025e96a1a07b7f09dc55ac1bb8c200d721ed26
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh
@@ -0,0 +1,50 @@
+set -eo pipefail
+
+BSZ=32
+TGT=90
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+      --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=./
+checkpoints_path=${project_path}/data/bert-large-uncased
+datasets_path=${project_path}/data
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine FP16(bert large squad)!"
+ +python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \ + -w 4096 \ + -o ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 1 384 384\ + -b 1 ${BSZ} ${BSZ}\ + --fp16 \ + -c ${checkpoints_path}/bert_config.json \ + -z ${USE_TRT} + +echo "Step2 Run dev.json and generate json" +python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 384 \ + -b ${BSZ} \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \ + -z ${USE_TRT} + +echo "Step3 Inference(test F1-score)" +python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_large_b${BSZ}.json ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..44285fa117ee5be2798595d9c2466a46f96cfe5d --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh @@ -0,0 +1,48 @@ +set -eo pipefail + +BSZ=32 +TGT=150 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +project_path=./ +checkpoints_path=${project_path}/data/bert-large-uncased +datasets_path=${project_path}/data + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine FP16(bert large squad)!" + +python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \ + -w 4096 \ + -o ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 1 384 384\ + -b 1 ${BSZ} ${BSZ}\ + --fp16 \ + -c ${checkpoints_path}/bert_config.json \ + -z ${USE_TRT} + +echo "Step2 Inference(test QPS)" +UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 384 \ + -b ${BSZ} \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \ + -z ${USE_TRT} \ + --target_qps ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..ddbcf2340640027ff1bc5d6477c6e393c401f093 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh @@ -0,0 +1,49 @@ +set -eo pipefail + +BSZ=32 +TGT=88 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +project_path=./ +checkpoints_path=${project_path}/data/bert-large-uncased +datasets_path=${project_path}/data + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine Int8(bert large squad)!" 
+ +python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \ + -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b 1 ${BSZ} ${BSZ} \ + -s 1 384 384 \ + -i \ + -c ${checkpoints_path} + +echo "Step2 Run dev.json and generate json" +python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b ${BSZ} \ + -s 384 \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \ + -z ${USE_TRT} \ + -i + +echo "Step3 Inference(test F1-score)" +python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..3ead05efba507486eb53a32e8cb5cc29fb7bcfc5 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh @@ -0,0 +1,47 @@ +set -eo pipefail + +BSZ=32 +TGT=200 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) +project_path=./ +checkpoints_path=${project_path}/data/bert-large-uncased +datasets_path=${project_path}/data + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine Int8(bert large squad)!" + +python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \ + -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b 1 ${BSZ} ${BSZ} \ + -s 1 384 384 \ + -i \ + -c ${checkpoints_path} + +echo "Step2 Inference(test QPS)" +UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b ${BSZ} \ + -s 384 \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \ + -z ${USE_TRT} \ + --target_qps ${TGT} \ + -i \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/prepare.sh similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh rename to models/nlp/plm/bert_large_squad/ixrt/scripts/prepare.sh diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc b/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc deleted file mode 100644 index ceea8d8b80468cce08d46637073504e1d3a4057f..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-#include <mutex>
-#include <stack>
-#include <unordered_set>
-
-#include "NvInfer.h"
-#include "NvInferPlugin.h"
-#include "NvInferRuntimeCommon.h"
-#include "custom_fc/fcPlugin.h"
-#include "emb_layernorm/embLayerNormPlugin.h"
-#include "emb_layernorm/embLayerNormInt8Plugin.h"
-#include "gelu/geluPlugin.h"
-#include "qkv_to_context/qkvToContextInt8Plugin.h"
-#include "qkv_to_context/qkvToContextPlugin.h"
-#include "skip_layernorm/skipLayerNormInt8Plugin.h"
-#include "skip_layernorm/skipLayerNormPlugin.h"
-#include "ffn/ffnPlugin.h"
-
-using namespace nvinfer1;
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-extern ILogger* gLogger;
-
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
-
-namespace {
-// This singleton ensures that each plugin is only registered once for a given
-// namespace and type, and attempts of duplicate registration are ignored.
-class PluginCreatorRegistry {
-   public:
-    static PluginCreatorRegistry& getInstance() {
-        static PluginCreatorRegistry instance;
-        return instance;
-    }
-
-    std::string GetPluginUniqKey(const AsciiChar* const plugin_namespace, const AsciiChar* const plugin_name,
-                                 const AsciiChar* const plugin_version) {
-        std::stringstream os;
-        if (plugin_namespace[0] != '\0') {
-            os << plugin_namespace << "/";
-        }
-        os << plugin_name;
-        if (plugin_version[0] != '\0') {
-            os << "/" << plugin_version;
-        }
-        return os.str();
-    }
-
-    template <typename CreatorType>
-    void addPluginCreator(void* logger, char const* libNamespace) {
-        printf("start addPluginCreator %s\n", libNamespace);
-        // Make accesses to the plugin creator registry thread safe
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        std::string errorMsg;
-        std::string verboseMsg;
-
-        std::unique_ptr<CreatorType> pluginCreator{new CreatorType{}};
-        pluginCreator->setPluginNamespace(libNamespace);
-
-        nvinfer1::ixrt_plugin::gLogger = static_cast<ILogger*>(logger);
-        std::string pluginType = GetPluginUniqKey(pluginCreator->getPluginNamespace(), pluginCreator->getPluginName(),
-                                                  pluginCreator->getPluginVersion());
-
-        if (mRegistryList.find(pluginType) == mRegistryList.end()) {
-            bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace);
-            if (status) {
-                mRegistry.push(std::move(pluginCreator));
-                mRegistryList.insert(pluginType);
-                printf("Registered plugin creator - %s\n", pluginType.c_str());
-                verboseMsg = "Registered plugin creator - " + pluginType;
-            } else {
-                printf("Could not register plugin creator - %s\n", pluginType.c_str());
-                errorMsg = "Could not register plugin creator - " + pluginType;
-            }
-        } else {
-            printf("Plugin creator already registered - %s\n", pluginType.c_str());
-            verboseMsg = "Plugin creator already registered - " + pluginType;
-        }
-
-        if (logger) {
-            if (!errorMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kERROR, errorMsg.c_str());
-            }
-            if (!verboseMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str());
-            }
-        }
-    }
-
-    ~PluginCreatorRegistry() {
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        // Release pluginCreators in LIFO order of registration.
-        while (!mRegistry.empty()) {
-            mRegistry.pop();
-        }
-        mRegistryList.clear();
-    }
-
-   private:
-    PluginCreatorRegistry() {}
-
-    std::mutex mRegistryLock;
-    std::stack<std::unique_ptr<IPluginCreator>> mRegistry;
-    std::unordered_set<std::string> mRegistryList;
-
-   public:
-    PluginCreatorRegistry(PluginCreatorRegistry const&) = delete;
-    void operator=(PluginCreatorRegistry const&) = delete;
-};
-
-template <typename CreatorType>
-void initializePlugin(void* logger, char const* libNamespace) {
-    PluginCreatorRegistry::getInstance().addPluginCreator<CreatorType>(logger, libNamespace);
-}
-
-}  // namespace
-
-extern "C" {
-bool initLibNvInferPlugins(void* logger, const char* libNamespace) {
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    initializePlugin(logger, libNamespace);
-    return true;
-}
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h
deleted file mode 100644
index bd094b403acf8fdc83b90ea6628c989e84815316..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#pragma once
-#include <cuda.h>
-#include <cuda_fp16.h>
-
-#include
-
-#ifndef C10_WARP_SIZE
-
-#ifdef __ILUVATAR__
-#define C10_WARP_SIZE 64
-#else
-#define C10_WARP_SIZE 32
-#endif
-
-#endif
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-const float epsilon = 0.000000000001;
-const unsigned int WARP_REDUCE_MASK = 0xffffffff;
-const float CUDA_FLOAT_INF_NEG = -100000000.f;  // FIXME later
-const float CUDA_FLOAT_INF_POS = 100000000.f;   // FIXME later
-const int CUDA_INT_INF = 2147483647;
-const int MAX_THREADS = 1024;
-
-__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) {
-    float i8_f = x * quant_scale;
-    int32_t i8 = floorf(i8_f + 0.5);
-    i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
-    return int8_t(i8);
-}
-
-inline __device__ void WelfordCombine(float val, float *mean, float *m2, float *count) {
-    // Use the Welford online algorithm to compute mean and variance
-    // For more details you can refer to:
-    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
-    *count += 1;
-    float delta1 = val - *mean;
-    *mean += delta1 / *count;
-    float delta2 = val - *mean;
-    *m2 += delta1 * delta2;
-}
-
-inline __device__ void WelfordCombine(float b_mean, float b_m2, float b_count, float *mean, float *m2, float *count) {
-    if (b_count == 0) {
-        return;
-    }
-    float new_count = *count + b_count;
-    float nb_over_n = b_count / new_count;
-    float delta = b_mean - *mean;
-    *mean += delta * nb_over_n;
-    *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
-    *count = new_count;
-}
-
-__inline__ __device__ void WelfordWarpReduce(float thread_mean, float thread_m2, float thread_count, float *mean,
-                                             float *m2, float *count) {
-    *mean = thread_mean;
-    *m2 = thread_m2;
-    *count = thread_count;
-    for (int mask = C10_WARP_SIZE / 2; mask > 0; mask /= 2) {
-        float b_mean = __shfl_down_sync(0xffffffff, *mean, mask);
-        float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask);
-        float b_count = __shfl_down_sync(0xffffffff, *count, mask);
-        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
-    }
-}
-// added by pxl
-// completes the reduction over all data within a block
-// template
-__inline__ __device__ void WelfordBlockAllReduce(float thread_mean, float thread_m2, float thread_count,
-                                                 float *result_mean, float *result_m2, float *result_count) {
-    __shared__ float mean_shared[C10_WARP_SIZE];
-    __shared__ float m2_shared[C10_WARP_SIZE];
-    __shared__ float count_shared[C10_WARP_SIZE];
-    __shared__ float mean_result_broadcast;
-    __shared__ float m2_result_broadcast;
-    __shared__ float count_result_broadcast;
-
-    const int lid = threadIdx.x % C10_WARP_SIZE;
-    const int wid = threadIdx.x / C10_WARP_SIZE;
-    float warp_mean = 0;
-    float warp_m2 = 0;
-    float warp_count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
-    __syncthreads();
-
-    if (lid == 0) {
-        mean_shared[wid] = warp_mean;
-        m2_shared[wid] = warp_m2;
-        count_shared[wid] = warp_count;
-    }
-    __syncthreads();
-
-    if (wid == 0) {
-        if (threadIdx.x < blockDim.x / C10_WARP_SIZE) {
-            warp_mean = mean_shared[lid];
-            warp_m2 = m2_shared[lid];
-            warp_count = count_shared[lid];
-
-        } else {
-            warp_mean = 0.f;
-            warp_m2 = 0.f;
-            warp_count = 0.f;
-        }
-        __syncwarp();
-
-        float block_mean = 0;
-        float block_m2 = 0;
-        float block_count = 0;
-
-        WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
-
-        if (lid == 0) {
-            mean_result_broadcast = block_mean;
-            m2_result_broadcast = block_m2;
-            count_result_broadcast = block_count;
-        }
-    }
-    __syncthreads();
-    *result_mean = mean_result_broadcast;
-    *result_m2 = m2_result_broadcast;
-    *result_count = count_result_broadcast;
-}
-__forceinline__ __device__ char4 float42char4(float4 vals, float quant_scale) {
-    char4 res;
-    res.x = float2int8(vals.x, quant_scale);
-    res.y = float2int8(vals.y, quant_scale);
-    res.z = float2int8(vals.z, quant_scale);
-    res.w = float2int8(vals.w, quant_scale);
-    return res;
-}
-
-// load two half2 values and store them into a float4
-__forceinline__ __device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) {
-    __half2 i1 = input[index * 2];
-    __half2 i2 = input[index * 2 + 1];
-
-    vals.x = __half2float(i1.x);
-    vals.y = __half2float(i1.y);
-    vals.z = __half2float(i2.x);
-    vals.w = __half2float(i2.y);
-}
-
-/* Convert vector index to 3-dim tensor index */
-__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1, int dim2, int *id0, int *id1, int *id2) {
-    *id2 = src % dim2;
-    src /= dim2;
-
-    *id1 = src % dim1;
-    *id0 = src / dim1;
-}
-
-__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size,
-                                                            float epsilon, float4 scale, float4 bias) {
-    float4 norm_value;
-    norm_value.x = (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.x + bias.x;
-    norm_value.y = (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.y + bias.y;
-    norm_value.z = (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.z + bias.z;
-    norm_value.w = (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.w + bias.w;
-    return norm_value;
-}
-
-// for layer norm
-__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size,
-                                                            float epsilon, half2 scale_1, half2 scale_2, half2 bias_1,
-                                                            half2 bias_2) {
-    float4 norm_value;
-    norm_value.x =
-        (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x);
-    norm_value.y =
-        (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y);
-    norm_value.z =
-        (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.x) + __half2float(bias_2.x);
-    norm_value.w =
-        (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.y) + __half2float(bias_2.y);
-    return norm_value;
-}
-/* Convert half2 into float2, mask inf and -inf */
-__forceinline__ __host__ __device__ float safe_half_to_float(half hval) {
-    return fmax(fmin(100000.f, __half2float(hval)), -100000.f);
-}
-__forceinline__ __device__ float4 char4addfloat4_dequant(char4 input_4, float4 residual, float dequant_scale) {
-    float4 res;
-    res.x = __int2float_rn(input_4.x) * dequant_scale + residual.x;
-    res.y = __int2float_rn(input_4.y) * dequant_scale + residual.y;
-    res.z = __int2float_rn(input_4.z) * dequant_scale + residual.z;
-    res.w = __int2float_rn(input_4.w) * dequant_scale + residual.w;
-    return res;
-}
-
-__forceinline__ __device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1, half2 residual_2,
-                                                        float dequant_scale) {
-    float4 res;
-    res.x = __int2float_rn(input_4.x) * dequant_scale + safe_half_to_float(residual_1.x);
-    res.y = __int2float_rn(input_4.y) * dequant_scale + safe_half_to_float(residual_1.y);
-    res.z = __int2float_rn(input_4.z) * dequant_scale + safe_half_to_float(residual_2.x);
-    res.w = __int2float_rn(input_4.w) * dequant_scale + safe_half_to_float(residual_2.y);
-    return res;
-}
-
-// gelu
-// IxinferBiasGeluI8II8OKernel
-template <typename T>
-__forceinline__ __device__ T tanhf_exp(T x) {
-    // float e1 = __expf(x);
-    // float e2 = 1.0f / e1;
-    // return (e1 - e2) / (e1 + e2);
-
-    return (2.f / (1.f + __expf(-2.f * x)) - 1.f);
-}
-
-template <typename T>
-__forceinline__ __device__ T gelu(T x) {
-    float cdf = 0.5f * (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x))));
-    return x * cdf;
-}
-
-// softmax
-__forceinline__ __host__ __device__ int log2_ceil(int value) {
-    int log2_value = 0;
-    while ((1 << log2_value) < value) ++log2_value;
-    return log2_value;
-}
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width, unsigned int mask = 0xffffffff) {
-#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__))
-    return __shfl_xor_sync(mask, value, laneMask, width);
-#else
-    return __shfl_xor(value, laneMask, width);
-#endif
-}
-
-template <typename T>
-struct Add {
-    __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename T>
-struct Max {
-    __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
-};
-template <typename acc_t, int REDUCE_WARP_SIZE, template <typename> class ReduceOp>
-__device__ __forceinline__ void warp_reduce(acc_t *sum) {
-    ReduceOp<acc_t> r;
-#pragma unroll
-    for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) {
-        acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE);
-        *sum = r(*sum, b);
-    }
-}
-/* Convert 3-dim tensor index into vector index */
-__forceinline__ __host__ __device__ int targetid_3dim(int id1, int id2, int id3, int dim2, int dim3) {
-    return id1 * dim2 * dim3 + id2 * dim3 + id3;
-}
-
-/* Convert 4-dim tensor index into vector index */
-__forceinline__ __host__ __device__ int targetid_4dim(int id1, int id2, int id3, int id4, int dim2, int dim3,
-                                                      int dim4) {
-    // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
-    int res = id4;
-
-    int ld = dim4;
-    res += id3 * ld;
-
-    ld *= dim3;
-    res += id2 * ld;
-
-    ld *= dim2;
-    res += id1 * ld;
-
-    return res;
-}
-
-}  // namespace backend
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h
deleted file mode 100644
index c0f3484255b81e5e3d60d79e981359d0fa90c1cf..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/ -#pragma once -#include -#include -#include -#include - -#include - -#include "checkMacrosPlugin.h" - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace backend { - -/* GPU function guard */ - -/** - * @brief cublasLt gemm without imma - * - * @tparam OutType output dtype - * @tparam ScaleType scale dtype - * @param input_a - * @param input_b - * @param output_c - * @param batch_count - * @param m - * @param n - * @param k - * @param stridea - * @param strideb - * @param stridec - * @param alpha - * @param cublasLt_handle - * @param stream - */ -template -void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -inline void cublaslt_gemm(const half* input_a, 
const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype = CUDA_R_16F; - cudaDataType_t scale_dtype = CUDA_R_32F; - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - float beta = 0.0; - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -/************add by pxl *************/ -template -void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - 
cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, m, k, m)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -inline void cublaslt_gemm_nn(const half* input_a, const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t 
-    cublasLtMatrixLayout_t desc_c = NULL;
-
-    cudaDataType_t out_dtype = CUDA_R_16F;
-    cudaDataType_t scale_dtype = CUDA_R_32F;
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype));
-#else
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type));
-    CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype,
-                                                   sizeof(scale_dtype)));
-#endif
-
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, m, k, m));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m));
-
-    if (batch_count > 1) {
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-                                                         sizeof(stridea)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-                                                         sizeof(strideb)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-                                                         sizeof(stridec)));
-    }
-
-    float beta = 0.0;
-    CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta,
-                                   output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream));
-
-    CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c));
-}
-
-} // namespace backend
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
deleted file mode 100644
index b3f0bbcb3322868b8f9ec485cb294beec2373008..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#include "ixinfer_gemm_helper.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: m,k  input_b: n,k  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* NN: input_a: k,m  input_b: n,k  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* NT: input_a: k,m  input_b: k,n  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TT: input_a: k,m  input_b: k,n  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                  cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        status = cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_gemm error!");
-    }
-}
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                     cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        // k,m  n,k
-        status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_nn_gemm error!");
-    }
-}
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    // float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
    }
-}
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-} // namespace backend
-} // namespace nvinfer1::ixrt_plugin
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
deleted file mode 100644
index 2433b3a15ad2b4fb277dc0c5a233f28541cbb132..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-#pragma once
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-#include <cstdint>
-
-#include <cuinfer.h>
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream);
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                  cudaStream_t stream);
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                     cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-} // namespace backend
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h
deleted file mode 100644
index a75d902fb263c5ed484ec932e4c2c579b7db10c0..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cuda_fp16.h>
-
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <numeric>
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-constexpr uint32_t BDIM = 0;  // batch dimension
-constexpr uint32_t SDIM = 1;  // seq len dimension
-constexpr uint32_t HDIM = 2;  // hidden dimension
-
-#define TRT_UNUSED (void)
-
-template <typename T>
-struct CudaDeleter {
-    void operator()(T* buf) { IXRT_PLUGIN_CUASSERT(cudaFree(buf)); }
-};
-
-template <typename T>
-using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
-
-inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
-    switch (t) {
-        case nvinfer1::DataType::kINT32:
-            return 4;
-        case nvinfer1::DataType::kFLOAT:
-            return 4;
-        case nvinfer1::DataType::kHALF:
-            return 2;
-        case nvinfer1::DataType::kBOOL:
-        // case nvinfer1::DataType::kUINT8:
-        case nvinfer1::DataType::kINT8:
-            return 1;
-        default:
-            break;
-            // case DataType::kUNKNOWN:
-            // case DataType::kINT64:
-            // case DataType::kFLOAT64:
-            // break;
-    }
-    return 0;
-}
-
-inline int64_t getWeightsSize(nvinfer1::Weights const& w, nvinfer1::DataType type) {
-    return w.count * getElementSize(type);
-}
-
-template <typename T>
-using cuda_shared_ptr = std::shared_ptr<T>;
-
-template <typename T>
-void make_cuda_shared(cuda_shared_ptr<T>& ptr, void* cudaMem) {
-    ptr.reset(static_cast<T*>(cudaMem), bert::CudaDeleter<T>());
-}
-
-struct WeightsWithOwnership : public nvinfer1::Weights {
-    ILogger* logger_;
-    WeightsWithOwnership() {
-        values = nullptr;
-        count = 0;
-    }
-    ~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
-
-    WeightsWithOwnership(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership(WeightsWithOwnership const&&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&&) = delete;
-
-    void convertAndCopy(nvinfer1::Weights const& src, nvinfer1::DataType type, float scale = 1) {
-        this->type = type;
-        this->count = src.count;
-
-        if (type == nvinfer1::DataType::kFLOAT) {
-            auto destBuf = new float[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Float Array(Host)" << endl;
-                std::copy_n(static_cast<float const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kHALF);
-
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Float Array(Host)" << endl;
-                auto const s = static_cast<half const*>(src.values);
-                auto d = static_cast<float*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __half2float(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kHALF) {
-            auto destBuf = new half[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kHALF) {
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Half Array(Host)" << endl;
-                std::copy_n(static_cast<half const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kFLOAT);
-
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Half Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<half*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __float2half(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kINT8) {
-            auto destBuf = new int8_t[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Int8 Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<int8_t*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    int32_t v = static_cast<int32_t>(std::roundf(s[it] / scale));
-                    d[it] = v <= -127 ? -127 : (v >= 127 ? 127 : v);
-                }
-            } else if (src.type == nvinfer1::DataType::kINT8) {
-                ixrt_plugin::gLogInfo << "Int8 Weights(Host) => Int8 Array(Host)" << endl;
-                std::copy_n(static_cast<int8_t const*>(src.values), src.count, destBuf);
-            } else {
-                throw std::runtime_error("Unsupported DataType specified for plugin.");
-            }
-        } else {
-            throw std::runtime_error("Unsupported DataType specified for plugin.");
-        }
-    }
-
-    void convertAndCopy(char const*& srcBuf, size_t count, nvinfer1::DataType type) noexcept {
-        this->type = type;
-        this->count = count;
-        auto const nbBytes = getWeightsSize(*this, type);
-        auto destBuf = new char[nbBytes];
-        this->values = destBuf;
-
-        std::copy_n(srcBuf, nbBytes, destBuf);
-        srcBuf += nbBytes;
-    }
-};
-
-template <typename T>
-inline void copyToDevice(WeightsWithOwnership& hostWeights, size_t nbBytes, cuda_unique_ptr<T>& cudaWeights) {
-    if (hostWeights.values) {
-        void* cudaMem{nullptr};
-        IXRT_PLUGIN_CUASSERT(cudaMalloc(&cudaMem, nbBytes));
-        IXRT_PLUGIN_CUASSERT(cudaMemcpy(cudaMem, hostWeights.values, nbBytes, cudaMemcpyHostToDevice));
-        cudaWeights.reset(static_cast<T*>(cudaMem));
-    }
-}
-
-template <typename T>
-inline void serFromDev(char*& buffer, T const* data, size_t nbElem) {
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(buffer, static_cast<void const*>(data), len, cudaMemcpyDeviceToHost));
-    buffer += len;
-}
-
-template <typename T>
-inline T* deserToDev(char const*& buffer, size_t nbElem) {
-    void* dev{nullptr};
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMalloc(&dev, len));
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(dev, buffer, len, cudaMemcpyHostToDevice));
-
-    buffer += len;
-    return static_cast<T*>(dev);
-}
-
-inline nvinfer1::DataType fieldTypeToDataType(const nvinfer1::PluginFieldType ftype) {
-    switch (ftype) {
-        case nvinfer1::PluginFieldType::kFLOAT32: {
-            gLogInfo << "PluginFieldType is Float32" << endl;
-            return nvinfer1::DataType::kFLOAT;
-        }
-        case nvinfer1::PluginFieldType::kFLOAT16: {
-            gLogInfo << "PluginFieldType is Float16" << endl;
-            return nvinfer1::DataType::kHALF;
-        }
-        case nvinfer1::PluginFieldType::kINT32: {
-            gLogInfo << "PluginFieldType is Int32" << endl;
-            return nvinfer1::DataType::kINT32;
-        }
-        case nvinfer1::PluginFieldType::kINT8: {
-            gLogInfo << "PluginFieldType is Int8" << endl;
-            return nvinfer1::DataType::kINT8;
-        }
-        default:
-            throw std::invalid_argument("No corresponding datatype for plugin field type");
-    }
-}
-
-inline int64_t volume(nvinfer1::Dims const& d) {
-    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
-}
-} // namespace bert
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp
deleted file mode 100644
index 8e705d6cdb96aef58aa1169cd6d99b5671d0d69e..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "checkMacrosPlugin.h"
-
-#include "NvInferRuntimeCommon.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-ILogger* gLogger{};
-
-template <ILogger::Severity kSeverity>
-int32_t LogStream<kSeverity>::Buf::sync() {
-    std::string s = str();
-    while (!s.empty() && s.back() == '\n') {
-        s.pop_back();
-    }
-    if (gLogger != nullptr) {
-        gLogger->log(kSeverity, s.c_str());
-    }
-    str("");
-    return 0;
-}
-
-// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
-// (otherwise, it will not log)
-LogStream<ILogger::Severity::kERROR> gLogError;
-LogStream<ILogger::Severity::kWARNING> gLogWarning;
-LogStream<ILogger::Severity::kINFO> gLogInfo;
-LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h
deleted file mode 100644
index 76d87a927516e4521ebb5233c1e2b729feab9532..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cuda_runtime.h>
-
-#include <cublas_v2.h>
-#include <iostream>
-#include <mutex>
-#include <sstream>
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-
-// Logs failed assertion and aborts.
-// Aborting is undesirable and will be phased-out from the plugin module, at which point
-// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE.
-using namespace std;
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-#ifdef _MSC_VER
-#define FN_NAME __FUNCTION__
-#else
-#define FN_NAME __func__
-#endif
-
-#define IXRT_PLUGIN_CHECK_VALUE(value, msg)                                \
-    {                                                                      \
-        if (not(value)) {                                                  \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"               \
-                      << "-" << __FUNCTION__ << " : "                      \
-                      << " Plugin assert error: " << msg << std::endl;     \
-            std::exit(EXIT_FAILURE);                                       \
-        }                                                                  \
-    }
-
-#define IXRT_PLUGIN_ASSERT(value)                                          \
-    {                                                                      \
-        if (not(value)) {                                                  \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"               \
-                      << "-" << __FUNCTION__ << " : "                      \
-                      << " Plugin assert false" << std::endl;              \
-            std::exit(EXIT_FAILURE);                                       \
-        }                                                                  \
-    }
-
-#define IXRT_PLUGIN_CHECK_CUDA(call)                                       \
-    do {                                                                   \
-        const cudaError_t error_code = call;                               \
-        if (error_code != cudaSuccess) {                                   \
-            printf("CUDA Error:\n");                                       \
-            printf("    File:       %s\n", __FILE__);                      \
-            printf("    Line:       %d\n", __LINE__);                      \
-            printf("    Error code: %d\n", error_code);                    \
-            printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
-            exit(1);                                                       \
-        }                                                                  \
-    } while (0)
-
-inline void caughtError(const std::exception& e) { std::cerr << e.what() << std::endl; }
-
-#define IXRT_PLUGIN_FAIL(msg)                                              \
-    do {                                                                   \
-        std::ostringstream stream;                                         \
-        stream << "Assertion failed: " << msg << "\n"                      \
-               << __FILE__ << ':' << __LINE__ << "\n"                      \
-               << "Aborting..."                                            \
-               << "\n";                                                    \
-        IXRT_PLUGIN_CHECK_CUDA(cudaDeviceReset());                         \
-        abort();                                                           \
-    } while (0)
-
-inline void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) {
-    std::cerr << file << " (" << line << ")"
-              << "-" << function << " : " << msg << std::endl;
-    std::exit(EXIT_FAILURE);
-}
-
-#define IXRT_PLUGIN_CUASSERT(status_)                                      \
-    {                                                                      \
-        auto s_ = status_;                                                 \
-        if (s_ != cudaSuccess) {                                           \
-            const char* msg = cudaGetErrorString(s_);                      \
-            throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg);          \
-        }                                                                  \
-    }
-
-#undef CUINFER_CHECK
-#define CUINFER_CHECK(func)                                                \
-    do {                                                                   \
-        cuinferStatus_t status = (func);                                   \
-        if (status != CUINFER_STATUS_SUCCESS) {                            \
-            std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \
-                      << cuinferGetErrorString(status) << std::endl;       \
-            std::exit(EXIT_FAILURE);                                       \
-        }                                                                  \
-    } while (0)
-
-static std::string _cudaGetErrorString(cublasStatus_t error) {
-    switch (error) {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
-
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
-
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
-
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
-
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-        case CUBLAS_STATUS_NOT_SUPPORTED:
-            return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-        case CUBLAS_STATUS_LICENSE_ERROR:
-            return "CUBLAS_STATUS_LICENSE_ERROR";
-    }
-    return "CUBLAS_UNKNOWN";
-}
-
-template <typename T>
-void check_gpu_error(T result, char const* const func, const char* const file, int const line) {
-    if (result) {
-        throw std::runtime_error(std::string("[CUDA][ERROR] ") + file + "(" + std::to_string(line) +
-                                 "): " + (_cudaGetErrorString(result)) + "\n");
-    }
-}
-
-#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)
-
-template <ILogger::Severity kSeverity>
-class LogStream : public std::ostream {
-    class Buf : public std::stringbuf {
-       public:
-        int32_t sync() override;
-    };
-
-    Buf buffer;
-    std::mutex mLogStreamMutex;
-
-   public:
-    std::mutex& getMutex() { return mLogStreamMutex; }
-    LogStream() : std::ostream(&buffer){};
-};
-
-// Use mutex to protect multi-stream write to buffer
-template <ILogger::Severity kSeverity, typename T>
-LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, T const& msg) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << msg;
-    return stream;
-}
-
-// Special handling for static numbers
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, int32_t num) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << num;
-    return stream;
-}
-
-// Special handling for std::endl
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, std::ostream& (*f)(std::ostream&)) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << f;
-    return stream;
-}
-
-extern LogStream<ILogger::Severity::kERROR> gLogError;
-extern LogStream<ILogger::Severity::kWARNING> gLogWarning;
-extern LogStream<ILogger::Severity::kINFO> gLogInfo;
-extern LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh b/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh
deleted file mode 100644
index b9b9eb8e4cec752014ccfdab5b259619d9d8d945..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*/
-
-#pragma once
-
-#include <cuda_runtime.h>
-
-#include <cstdint>
-namespace nvinfer1 {
-namespace ixrt_plugin {
-#ifdef __ILUVATAR__
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 64;
-static const dim3 kMaxBlockDimension = {4096, 4096, 64};
-static const dim3 kMaxGridDimension = {4294967295, 65536, 65536};
-static const int kNbThreadsPerBlockGainBestPerformance = 1024;
-static const int kMaxSharedMemSizePerBlock = (128 * 1024 * 4);
-static const int kNbSmemLane = 64;
-static const int kNbBytesPerSmemLane = 4;
-#else
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 32;
-static const dim3 kMaxBlockDimension = {1024, 1024, 64};
-static const dim3 kMaxGridDimension = {2147483647, 65535, 65535};
-static const int kNbThreadsPerBlockGainBestPerformance = 256;
-static const int kMaxSharedMemSizePerBlock = 48 * 1024 * 4;
-static const int kNbSmemLane = 32;
-static const int kNbBytesPerSmemLane = 4;
-#endif
-
-static const int kNbCe = 4;
-static const int kNbCuPerCe = 4;
-static const int kNbSppPerCu = 4;
-
-static const float kLog2e = 1.442695040888963387;
-
-#define DivUp(x, y) (((x) + (y)-1) / (y))
-
-__device__ __forceinline__ float floatExp(float x) { return __builtin_exp2f(kLog2e * x); }
-
-__device__ __forceinline__ float floatLog(float x) { return __logf(x); }
-
-__forceinline__ int nearest_num(int x, int value) {
-    if (x % value == 0) {
-        return x;
-    } else {
-        int padding = value - x % value;
-        return x + padding;
-    }
-}
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp
deleted file mode 100644
index 29908ff168e82bc43da09260bd7d5eb4dd52f94b..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "plugin.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1
-{
-namespace ixrt_plugin
-{
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc)
-{
-    for (int32_t i = 0; i < fc->nbFields; i++)
-    {
-        requiredFieldNames.erase(fc->fields[i].name);
-    }
-    if (!requiredFieldNames.empty())
-    {
-        std::stringstream msg{};
-        msg << "PluginFieldCollection missing required fields: {";
-        char const* separator = "";
-        for (auto const& field : requiredFieldNames)
-        {
-            msg << separator << field;
-            separator = ", ";
-        }
-        msg << "}";
-        std::string msg_str = msg.str();
-        IXRT_PLUGIN_CHECK_VALUE(false, msg_str.c_str());
-    }
-}
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h
deleted file mode 100644
index b24ef30067eeb17e526f9ee1430031873645dde0..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h
+++ /dev/null
@@ -1,72 +0,0 @@
-
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cstring>
-#include <set>
-#include <string>
-#include "NvInferRuntimeCommon.h"
-
-typedef enum
-{
-    STATUS_SUCCESS = 0,
-    STATUS_FAILURE = 1,
-    STATUS_BAD_PARAM = 2,
-    STATUS_NOT_SUPPORTED = 3,
-    STATUS_NOT_INITIALIZED = 4
-} pluginStatus_t;
-
-namespace nvinfer1 {
-
-namespace ixrt_plugin {
-
-
-// Write values into buffer
-template <typename T>
-void write(char*& buffer, const T& val) {
-    std::memcpy(buffer, &val, sizeof(T));
-    buffer += sizeof(T);
-}
-
-// Read values from buffer
-template <typename T>
-T read(const char*& buffer) {
-    T val{};
-    std::memcpy(&val, buffer, sizeof(T));
-    buffer += sizeof(T);
-    return val;
-}
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc);
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h
deleted file mode 100644
index 11ef7eca97ce5506712ee7993957cf3ac1eb0086..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-
-#include <cassert>
-#include <cstring>
-#include <type_traits>
-#include <vector>
-
-#include <iostream>
-using std::cerr;
-using std::cout;
-using std::endl;
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value);
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value);
-
-namespace
-{
-
-template <typename T, class Enable = void>
-struct Serializer
-{
-};
-
-template <typename T>
-struct Serializer<T, typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(T const&)
-    {
-        return sizeof(T);
-    }
-    static void serialize(void** buffer, T const& value)
-    {
-        ::memcpy(*buffer, &value, sizeof(T));
-        reinterpret_cast<char*&>(*buffer) += sizeof(T);
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, T* value)
-    {
-        assert(*buffer_size >= sizeof(T));
-        ::memcpy(value, *buffer, sizeof(T));
-        reinterpret_cast<char const*&>(*buffer) += sizeof(T);
-        *buffer_size -= sizeof(T);
-    }
-};
-
-template <>
-struct Serializer<const char*>
-{
-    static size_t serialized_size(const char* value)
-    {
-        return strlen(value) + 1;
-    }
-    static void serialize(void** buffer, const char* value)
-    {
-        ::strcpy(static_cast<char*>(*buffer), value);
-        reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, const char** value)
-    {
-        *value = static_cast<char const*>(*buffer);
-        size_t data_size = strnlen(*value, *buffer_size) + 1;
-        assert(*buffer_size >= data_size);
-        reinterpret_cast<char const*&>(*buffer) += data_size;
-        *buffer_size -= data_size;
-    }
-};
-
-template <typename T>
-struct Serializer<std::vector<T>,
-    typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(std::vector<T> const& value)
-    {
-        return sizeof(value.size()) + value.size() * sizeof(T);
-    }
-    static void serialize(void** buffer, std::vector<T> const& value)
-    {
-        serialize_value(buffer, value.size());
-        size_t nbyte = value.size() * sizeof(T);
-        ::memcpy(*buffer, value.data(), nbyte);
-        reinterpret_cast<char*&>(*buffer) += nbyte;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, std::vector<T>* value)
-    {
-        size_t size;
-        deserialize_value(buffer, buffer_size, &size);
-        value->resize(size);
-        size_t nbyte = value->size() * sizeof(T);
-        assert(*buffer_size >= nbyte);
-        ::memcpy(value->data(), *buffer, nbyte);
-        reinterpret_cast<char const*&>(*buffer) += nbyte;
-        *buffer_size -= nbyte;
-    }
-};
-
-} // namespace
-
-template <typename T>
-inline size_t serialized_size(T const& value)
-{
-    return Serializer<T>::serialized_size(value);
-}
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value)
-{
-    return Serializer<T>::serialize(buffer, value);
-}
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value)
-{
-    return Serializer<T>::deserialize(buffer, buffer_size, value);
-}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp
deleted file mode 100644
index cf00d620b2c3d47f0bea4bbad3f9fdc003bbe6bd..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp
+++ /dev/null
@@ -1,431 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "NvInferRuntimeCommon.h"
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "cuda_runtime_api.h"
-#include "driver_types.h"
-#include "fcPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-#include <limits>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace {
-char const* const kFC_VERSION{"2"};
-char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"};
-} // namespace
-
-// Static class fields initialization
-PluginFieldCollection FCInt8PluginDynamicCreator::mFC{};
-std::vector<PluginField> FCInt8PluginDynamicCreator::mPluginAttributes;
-
-FCInt8PluginDynamicCreator::FCInt8PluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kINT8, 1));
-    mPluginAttributes.emplace_back(PluginField("fc_amax", nullptr, PluginFieldType::kFLOAT32, 2));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* FCInt8PluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; }
-
-char const* FCInt8PluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-PluginFieldCollection const* FCInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* FCInt8PluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FCInt8PluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        Weights W{DataType::kINT8, nullptr, 0LL};
-        Weights Bias{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "W", "fc_amax"}, fc);
-        vector<float> weight_scale;
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("W") == 0) {
-                gLogInfo << "Building W..." << endl;
<< endl; - W.values = fc->fields[i].data; - W.count = fc->fields[i].length; - W.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W int8: " << (W.type == DataType::kINT8) << endl; - } - - if (fieldName.compare("Bias") == 0) { - gLogInfo << "Building Bias..." << endl; - Bias.values = fc->fields[i].data; - Bias.count = fc->fields[i].length; - Bias.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is Bias float32: " << (Bias.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("fc_amax") == 0) { - gLogInfo << "Building fc_amax..." << endl; - for (auto j = 0; j < fc->fields[i].length; j++) { - auto value = static_cast(fc->fields[i].data)[j]; - weight_scale.emplace_back(value / 127.0); - } - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (W.count == 0 || W.values == nullptr || W.count < outDims) { - gLogInfo << "Invalid weights" << endl; - } - - DataType type = DataType::kINT8; - return new FCInt8PluginDynamic(name, type, outDims, W, Bias, weight_scale); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FCInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FCInt8PluginDynamic::destroy() - try { - return new FCInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FCInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FCInt8PluginDynamicCreator); -//#########################################################################// -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, DataType const type, int32_t const outDim, - Weights const& W, Weights const& Bias, vector const& scale) - : mLayerName(name), - mType(type), - mOutDim(outDim), - mNumParams(W.count), - mNmax(0), - mK(0), - mWdev(nullptr), - mNumBias(Bias.count), - mScale(scale), - mBiasdev(nullptr) { - if (W.type == nvinfer1::DataType::kFLOAT) { - float weight_max = std::numeric_limits::min(); - for (int64_t wb = 0, we = W.count; wb < we; ++wb) { - float val = static_cast(W.values)[wb]; - weight_max = std::max(weight_max, std::abs(val)); - } - // mWeightScale = 127 / weight_max; - } - - mW.convertAndCopy(W, DataType::kINT8, scale[0]); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (Bias.values != nullptr) { - mBias.convertAndCopy(Bias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev(nullptr), mBiasdev(nullptr) { - gLogInfo << "FCInt8PluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mOutDim); - deserialize_value(&data, &length, &mNumParams); - deserialize_value(&data, &length, &mNmax); - deserialize_value(&data, &length, &mK); - deserialize_value(&data, &length, &mNumBias); - deserialize_value(&data, &length, &mScale); - - char const* d = 
static_cast(data); - - mW.convertAndCopy(d, mNumParams, DataType::kINT8); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (mNumBias > 0) { - mBias.convertAndCopy(d, mNumBias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -// IPluginV2 Methods -char const* FCInt8PluginDynamic::getPluginType() const noexcept { return kFC_NAME; } - -char const* FCInt8PluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; } - -int32_t FCInt8PluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FCInt8PluginDynamic::initialize() noexcept { - gLogInfo << "FCInt8PluginDynamic initialize" << endl; - return 0; -} - -void FCInt8PluginDynamic::terminate() noexcept { gLogInfo << "FCInt8PluginDynamic terminate" << endl; } - -size_t FCInt8PluginDynamic::getSerializationSize() const noexcept { - return sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNmax) + sizeof(mK) + sizeof(mNumBias) + - mScale.size() * sizeof(float) + sizeof(mScale.size()) + getElementSize(DataType::kINT8) * mNumParams + - getElementSize(DataType::kFLOAT) * mNumBias; -} - -void FCInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mOutDim); - serialize_value(&buffer, mNumParams); - serialize_value(&buffer, mNmax); - serialize_value(&buffer, mK); - serialize_value(&buffer, mNumBias); - serialize_value(&buffer, mScale); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev.get()), mNumParams * getElementSize(DataType::kINT8)); - - if (mNumBias > 0) { - serFromDev(d, static_cast(mBiasdev.get()), mNumBias * getElementSize(DataType::kFLOAT)); - } -} - -void FCInt8PluginDynamic::destroy() noexcept { - gLogInfo << "FCInt8PluginDynamic destroy" << endl; - mWdev.reset(nullptr); - if (mNumBias > 0) { - mBiasdev.reset(nullptr); - } - delete this; -} - -void FCInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FCInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - // IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kINT8); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FCInt8PluginDynamic::clone() const noexcept { - try { - gLogInfo << "FCInt8PluginDynamic clone" << endl; - - auto* p = new FCInt8PluginDynamic(mLayerName, mType, mOutDim, mW, mBias, mScale); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FCInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutDim); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - 
caughtError(e); - } - return DimsExprs{}; -} - -bool FCInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FCInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - mK = inDims0.d[HDIM]; // hiddensize - // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FCInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - int32_t const B = inputs[0].dims.d[BDIM]; - int32_t const S = inputs[0].dims.d[SDIM]; - int32_t const oE = outputs[0].dims.d[HDIM]; -#ifdef __ILUVATAR__ - return B * S * oE * sizeof(int8_t); -#else - return B * S * oE * sizeof(int32_t); -#endif -} - -int32_t FCInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const E = inputDesc->dims.d[HDIM]; - int32_t const oE = outputDesc->dims.d[HDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - float qkv_in_scale = inputDesc[0].scale; - float qkv_wei_scale = mScale[0]; - float output_scale = outputDesc[0].scale; - float qkv_out_scale; - if (mScale.size() == 2) { - qkv_out_scale = mScale[1]; - } else { - qkv_out_scale = output_scale; - } -#ifdef __ILUVATAR__ - int8_t* buffer = static_cast(workSpace); -#else - int32_t* buffer = static_cast(workSpace); -#endif - if (mType == DataType::kINT8) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight = static_cast(mWdev.get()); - - float dequant_scale = (qkv_in_scale * qkv_wei_scale) / qkv_out_scale; - - if (mBiasdev.get() != nullptr) { -#ifdef __ILUVATAR__ - cuinfer_i8_gemm(weight, input, nullptr, buffer, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0, - cuinfer_handle, stream); - dequantGemmWithBias(buffer, static_cast(mBiasdev.get()), output, B * S, oE, qkv_out_scale, - 1.0 / output_scale, stream); -#else - cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream); - dequantGemmWithBias(buffer, static_cast(mBiasdev.get()), output, B * S, oE, dequant_scale, qkv_out_scale, 
- 1.0 / output_scale, stream); -#endif - - } else { -#ifdef __ILUVATAR__ - cuinfer_i8_gemm(weight, input, nullptr, output, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0, - cuinfer_handle, stream); -#else - - cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream); - quantGemm(buffer, output, B * S, oE, dequant_scale, stream); -#endif - } - } else { - gLogError << "Unsupported type error, expected [kINT8], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu deleted file mode 100644 index 7e233c878814dc347e7da8e310c96fd24923e8b9..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu +++ /dev/null @@ -1,485 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "backend/bert/bert_helper.h" -#include "fcPlugin.h" -using namespace nvinfer1::ixrt_plugin::backend; -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -template -__global__ void dequant_gemm_without_bias(const int8_t* input, int8_t* output, int hidden_size, float dequant_scale, - float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - char4* p_input = (char4*)input; - char4* p_output = (char4*)output; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - - val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale; - val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale; - val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale; - val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale; - - char4 res = float42char4(val[it], quant_scale); - p_output[element_index] = res; - } -} - -template -__global__ void dequant_gemm_with_bias(const int8_t* input, const float* bias, int8_t* output, int hidden_size, - float dequant_scale, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - char4* p_input = (char4*)input; - float4* p_bias = (float4*)bias; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - bias_val.x = p_bias[element_index].x; - bias_val.y = p_bias[element_index].y; - bias_val.z = p_bias[element_index].z; - bias_val.w = p_bias[element_index].w; - - val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale + bias_val.x; - val[it].y = 
__int2float_rn(p_input[element_index].y) * dequant_scale + bias_val.y; - val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale + bias_val.z; - val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale + bias_val.w; - - char4 res = float42char4(val[it], quant_scale); - p_output[element_index] = res; - } -} - -template -__global__ void dequant_gemm_with_bias(const int32_t* input, const float* bias, int8_t* output, int hidden_size, - float quant_scale1, float dequant_scale, float quant_scale2, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - float4* p_bias = (float4*)bias; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - bias_val.x = p_bias[element_index].x; - bias_val.y = p_bias[element_index].y; - bias_val.z = p_bias[element_index].z; - bias_val.w = p_bias[element_index].w; - - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale1); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale1); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale1); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale1); - - val[it].x = __int2float_rn(q_input.x) * dequant_scale + bias_val.x; - val[it].y = __int2float_rn(q_input.y) * dequant_scale + bias_val.y; - val[it].z = __int2float_rn(q_input.z) * dequant_scale + bias_val.z; - val[it].w = __int2float_rn(q_input.w) * dequant_scale + bias_val.w; - - char4 res = float42char4(val[it], quant_scale2); - p_output[element_index] = res; - } -} - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_without_bias<1> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_without_bias<2> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_without_bias<3> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_without_bias<4> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_without_bias<5> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_without_bias<6> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_without_bias<7> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_without_bias<8> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_without_bias<9> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_without_bias<10> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 
11: - dequant_gemm_without_bias<11> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_without_bias<12> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_without_bias<13> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_without_bias<14> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_without_bias<15> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - dequant_gemm_without_bias<16> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithoutBias"); - break; - } -} - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int8_t 
input"); - break; - } -} - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float quant_scale1, float dequant_scale, float quant_scale2, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int32_t input"); - break; - } -} - -template -__global__ void quant_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - char4 q_input; - q_input.x = 
float2int8(p_input[element_index].x*1.0, quant_scale); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale); - - p_output[element_index] = q_input; - } -} - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - quant_gemm<1> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 2: - quant_gemm<2> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 3: - quant_gemm<3> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 4: - quant_gemm<4> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 5: - quant_gemm<5> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 6: - quant_gemm<6> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 7: - quant_gemm<7> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 8: - quant_gemm<8> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 9: - quant_gemm<9> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 10: - quant_gemm<10> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 11: - quant_gemm<11> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 12: - quant_gemm<12> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 13: - quant_gemm<13> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 14: - quant_gemm<14> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 15: - quant_gemm<15> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 16: - quant_gemm<16> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - default: - throw std::runtime_error("quantGemm"); - break; - } -} - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp deleted file mode 100644 index 675415356d82188b92994919df2d7f45828ed543..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp +++ /dev/null @@ -1,345 +0,0 @@ -#include "fcPlugin.h" - -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFC_VERSION{"1"}; -char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FCPluginDynamicCreator::mFC{}; -std::vector 
FCPluginDynamicCreator::mPluginAttributes;
-
-FCPluginDynamicCreator::FCPluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kFLOAT32, 1));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* FCPluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; }
-
-char const* FCPluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-PluginFieldCollection const* FCPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* FCPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FCPluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        int32_t typeId = -1;
-        Weights W{DataType::kFLOAT, nullptr, 0LL};
-        Weights B{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("type_id") == 0) {
-                typeId = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-
-            if (fieldName.compare("W") == 0) {
-                gLogInfo << "Building W..." << endl;
-                W.values = fc->fields[i].data;
-                W.count = fc->fields[i].length;
-                W.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W float32: " << (W.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("B") == 0) {
-                gLogInfo << "Building B..." << endl;
-                B.values = fc->fields[i].data;
-                B.count = fc->fields[i].length;
-                B.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is B float32: " << (B.type == DataType::kFLOAT) << endl;
-            }
-        }
-
-        if (outDims <= 0) {
-            gLogInfo << "Invalid output dimension" << endl;
-        }
-        if (typeId < 0 || typeId > 1) {
-            gLogInfo << "Invalid type id" << typeId << endl;
-        }
-        if (W.count == 0 || W.values == nullptr || W.count < outDims) {
-            gLogInfo << "Invalid weights" << endl;
-        }
-
-        DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF;
-        return new FCPluginDynamic(name, type, outDims, W, B);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* FCPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                     size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call FCPluginDynamic::destroy()
-    try {
-        return new FCPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void FCPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(FCPluginDynamicCreator);
-//#########################################################################//
-FCPluginDynamic::FCPluginDynamic(std::string const name, DataType const type, int32_t const outDim, Weights const& W,
-                                 Weights const& B)
-    : mLayerName(name),
-      mType(type),
-      mOutDim(outDim),
-      mNumParams(W.count),
-      mNumBias(B.count),
-      mWdev(nullptr),
-      mBdev(nullptr) {
-    mW.convertAndCopy(W, mType);
-    copyToDevice(mW, getWeightsSize(mW, mType), mWdev);
-    if (mNumBias) {
-        mB.convertAndCopy(B, mType);
-        copyToDevice(mB, getWeightsSize(mB, mType), mBdev);
-    }
-}
-
-FCPluginDynamic::FCPluginDynamic(std::string const name, void const* data, size_t length)
-    : mLayerName(name), mWdev(nullptr) {
-    gLogInfo << "FCPluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mOutDim);
-    deserialize_value(&data, &length, &mNumParams);
-    deserialize_value(&data, &length, &mNumBias);
-
-    char const* d = static_cast<char const*>(data);
-
-    mW.convertAndCopy(d, mNumParams, mType);
-    copyToDevice(mW, getWeightsSize(mW, mType), mWdev);
-    if (mNumBias) {
-        mB.convertAndCopy(d, mNumBias, mType);
-        copyToDevice(mB, getWeightsSize(mB, mType), mBdev);
-    }
-}
-
-// IPluginV2 Methods
-char const* FCPluginDynamic::getPluginType() const noexcept { return kFC_NAME; }
-
-char const* FCPluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-int32_t FCPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t FCPluginDynamic::initialize() noexcept {
-    gLogInfo << "FCPluginDynamic initialize" << endl;
-    return 0;
-}
-
-void FCPluginDynamic::terminate() noexcept { gLogInfo << "FCPluginDynamic terminate" << endl; }
-
-size_t FCPluginDynamic::getSerializationSize() const noexcept {
-    size_t wordSize = getElementSize(mType);
-    return wordSize * (mNumParams + mNumBias) + sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNumBias);
-}
-
-void FCPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mOutDim);
-    serialize_value(&buffer, mNumParams);
-    serialize_value(&buffer, mNumBias);
-
-    size_t wordSize = getElementSize(mType);
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, static_cast<char*>(mWdev.get()), mNumParams * wordSize);
-    if (mNumBias) {
-        serFromDev(d, static_cast<char*>(mBdev.get()), mNumBias * wordSize);
-    }
-}
-
-void FCPluginDynamic::destroy() noexcept {
-    gLogInfo << "FCPluginDynamic destroy" << endl;
-    mWdev.reset(nullptr);
-    if (mNumBias) {
-        mBdev.reset(nullptr);
-    }
-    delete this;
-}
-
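
The deserializing constructor and serialize() above depend on one invariant: fields are read back in exactly the order they were written, via the serialize_value/deserialize_value helpers from the serialize.h deleted earlier in this patch. Below is a minimal round-trip sketch of that contract, assuming those helpers are in scope; the TinyParams struct and function names are illustrative, not part of the deleted source.

    // Round-trip sketch; assumes the serialize_value/deserialize_value
    // templates from serialize.h above. TinyParams is illustrative only.
    #include <cstdint>
    #include <vector>

    struct TinyParams {
        int32_t outDim;
        size_t numParams;
        std::vector<float> scale;
    };

    inline void writeParams(void* buffer, TinyParams const& p) {
        serialize_value(&buffer, p.outDim);     // field 1
        serialize_value(&buffer, p.numParams);  // field 2
        serialize_value(&buffer, p.scale);      // field 3 (size, then data)
    }

    inline void readParams(void const* data, size_t length, TinyParams* p) {
        // Must mirror writeParams exactly; reordering corrupts every later field.
        deserialize_value(&data, &length, &p->outDim);
        deserialize_value(&data, &length, &p->numParams);
        deserialize_value(&data, &length, &p->scale);
    }
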
-void FCPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FCPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FCPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FCPluginDynamic clone" << endl; - - auto* p = new FCPluginDynamic(mLayerName, mType, mOutDim, mW, mB); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FCPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutDim); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FCPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FCPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FCPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t FCPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FCPluginDynamic.." 
<< endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const E = inputDesc->dims.d[HDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight = static_cast(mWdev.get()); - half* bias = nullptr; - if (mNumBias) { - bias = static_cast(mBdev.get()); - } - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight, input, bias, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, -1, stream, cuinfer_handle); -#else - cublaslt_gemm(weight, input, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, blaslt_handle, stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h deleted file mode 100644 index 2f9115dc166a087a9bfd604a9697b8db28a2c8ca..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h +++ /dev/null @@ -1,246 +0,0 @@ - -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#pragma once -#include -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "bertCommon.h" -#include "driver_types.h" - -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#else -#include "backend/cublas/cublas_helper.h" -#endif - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - cudaStream_t stream); - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale1, float dequant_scale2, float quant_scale, cudaStream_t stream); - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream); - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream); - -class FCPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FCPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& B); - - FCPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FCPluginDynamic without arguments, so we - // delete default constructor. - FCPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - size_t mNumBias; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - bert::WeightsWithOwnership mB; - bert::cuda_unique_ptr mBdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t 
blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
-
-class FCPluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    FCPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-class FCInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    FCInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim,
-                        nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector<float> const& scale);
-
-    FCInt8PluginDynamic(std::string const name, void const* data, size_t length);
-
-    // It doesn't make sense to make FCInt8PluginDynamic without arguments, so we
-    // delete default constructor.
-    FCInt8PluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    size_t mOutDim;  // leading dim
-    size_t mNumParams;
-    int32_t mNmax;
-    int32_t mK;
-    int32_t mNumBias;
-
-    vector<float> mScale;
-
-    bert::WeightsWithOwnership mW;
-    bert::cuda_unique_ptr<void> mWdev;
-
-    bert::WeightsWithOwnership mBias;
-    bert::cuda_unique_ptr<void> mBiasdev;
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
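
FCInt8PluginDynamic carries per-layer scales (mScale) because an int8 GEMM yields raw accumulators that must be rescaled before they can be emitted as int8 again. Below is a scalar sketch of the dequantize, add-bias, requantize pattern that the deleted kernels apply in vectorized form; the function name, rounding mode, and clamp bounds are assumptions for illustration, not the deleted code.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar sketch of int8 requantization; illustrative only.
    inline int8_t requantizeOne(int32_t acc,       // int8 x int8 GEMM accumulator
                                float inScale,     // input activation scale
                                float weiScale,    // weight scale, e.g. amax / 127
                                float bias,        // fp32 bias for this channel
                                float outScale) {  // target activation scale
        float x = static_cast<float>(acc) * inScale * weiScale + bias;   // dequantize + bias
        int32_t q = static_cast<int32_t>(std::nearbyint(x / outScale));  // requantize
        return static_cast<int8_t>(std::min(127, std::max(-127, q)));    // saturate
    }
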
- -class FCInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FCInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp deleted file mode 100644 index 292e8a631f945c33c2ec7771e9f1136b5ee6828c..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp +++ /dev/null @@ -1,503 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#include "NvInferImpl.h" -#include "NvInferRuntimeCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" -#include "embLayerNormInt8Plugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_INT8_VERSION{"2"}; -char const* EMB_LAYER_NORM_INT8_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormInt8PluginDynamicCreator::mFC{}; -std::vector EmbLayerNormInt8PluginDynamicCreator::mPluginAttributes; - -EmbLayerNormInt8PluginDynamicCreator::EmbLayerNormInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return EMB_LAYER_NORM_INT8_VERSION; -} - -PluginFieldCollection const* EmbLayerNormInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormInt8PluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl; - word_emb.values = fc->fields[i].data; - word_emb.count = fc->fields[i].length; - word_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl; - tok_emb.values = fc->fields[i].data; - tok_emb.count = fc->fields[i].length; - tok_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_position_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_position_embeddings..." << endl; - pos_emb.values = fc->fields[i].data; - pos_emb.count = fc->fields[i].length; - pos_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp16") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp16 = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building output_fp16: " << output_fp16 << endl; - } - - if (field_name.compare("full_mask") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - useFullMask = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building full_mask: " << useFullMask << endl; - } - - if (field_name.compare("mha_type_id") == 0) { - mhaTypeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3); - gLogInfo << "Building mha typeId: " << mhaTypeId << endl; - } - - if (field_name.compare("pad_id") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32) - pad_id = *static_cast(fc->fields[i].data); - } - } - gLogInfo << "Building EmbLayerNormInt8PluginDynamic Plugin..." << endl; - DataType mhaType = static_cast(mhaTypeId); - EmbLayerNormInt8PluginDynamic* p = - new EmbLayerNormInt8PluginDynamic(name, output_fp16 ? 
DataType::kHALF : DataType::kFLOAT, mhaType, beta, - gamma, word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormInt8PluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormInt8PluginDynamicCreator); - -//#########################################################################// -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, - Weights const& gamma, Weights const& wordEmb, - Weights const& posEmb, Weights const& tokEmb, - bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormInt8PluginDynamic deserialize." 
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast<char const*>(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormInt8PluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_INT8_VERSION; } - -int32_t EmbLayerNormInt8PluginDynamic::getNbOutputs() const noexcept { return 3; } - -int32_t EmbLayerNormInt8PluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormInt8PluginDynamic::terminate() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic terminate." << endl; -} - -size_t EmbLayerNormInt8PluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(float) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast<char*>(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast<char*>(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast<char*>(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast<char*>(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormInt8PluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic destroy."
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index >= 0 && index <= 2); - if (index == 0) { - return mMhaType; - } - if (index == 1) { - return DataType::kINT8; - } - return DataType::kFLOAT; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormInt8PluginDynamic clone." << endl; - - auto p = new EmbLayerNormInt8PluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex >= 0 && outputIndex <= 2); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - if (outputIndex == 1) { - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } - - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 3); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // emb_out - if (pos == 3 || pos == 4) { - return desc.type == DataType::kINT8 && desc.format == prev.format; - } - // residual - return desc.type == DataType::kFLOAT; -} - -void EmbLayerNormInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t
nbInputs, - DynamicPluginTensorDesc const* outputs, - int32_t nbOutputs) noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic configurePlugin." << endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 3); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2); - int32_t const S = inputs[0].desc.dims.d[SDIM]; - mSeqLen = S; - int32_t const B = inputs[0].desc.dims.d[BDIM]; - TRT_UNUSED B; - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[1].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[2].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]); - - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[SDIM]) == mSeqLen); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B); - IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[2]) == mHiddenSize); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1); - - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == S); - - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[SDIM] == outputs[0].desc.dims.d[SDIM]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[BDIM] == outputs[0].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[2] == outputs[0].desc.dims.d[2]); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[4] == 1); -} - -size_t EmbLayerNormInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - int32_t const B = inputs[0].dims.d[BDIM]; - int32_t const S = inputs[0].dims.d[SDIM]; - return B * S * sizeof(int32_t); -} - -int32_t EmbLayerNormInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t status = STATUS_SUCCESS; - int32_t fmha_S = S; - int32_t batch_tokens = B * fmha_S; - - // Our plugin outputs only one tensor - auto const inputIds = static_cast<int32_t const*>(inputs[0]); - auto const segmentIds = static_cast<int32_t const*>(inputs[1]); - - float const* beta = mBetaDev.get(); - float const* gamma = mGammaDev.get(); - auto output = static_cast<int8_t*>(outputs[0]); - auto mNewMask = static_cast<int8_t*>(outputs[1]); - auto residual = static_cast<float*>(outputs[2]); - auto const wordEmb = static_cast<float const*>(mWordEmbDev.get()); - auto const tokEmb = static_cast<float const*>(mTokEmbDev.get()); - auto const posEmb = static_cast<float const*>(mPosEmbDev.get()); - - float l0_qkv_in_amax = outputDesc[0].scale * 127; - - auto mask_idx = static_cast<int32_t*>(workspace); - status = embLayerNorm(stream, static_cast<int32_t>(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma, - wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, residual, output, mask_idx, - mPadId, l0_qkv_in_amax); - - IxinferMaskPad(mask_idx, mNewMask, B, S, mHiddenSize, fmha_S, batch_tokens, stream); - - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -}
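An aside on the quantization convention the enqueue above relies on: TensorRT stores a per-tensor scale of `amax / 127` in `outputDesc[0].scale`, the plugin recovers `l0_qkv_in_amax` by multiplying back by 127, and the kernels then multiply activations by `127 / amax` and saturate (the device path does this in `float42char4`). A minimal host-side sketch of that symmetric int8 scheme; `quantize_int8` / `dequantize_int8` are illustrative helpers, not functions from this repo:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric per-tensor int8 quantization, as implied by the plugin:
// TensorRT's stored scale is amax / 127, so the multiplier is 127 / amax.
static int8_t quantize_int8(float x, float amax) {
    float q = x * (127.0f / amax);                 // scale into int8 range
    q = std::min(127.0f, std::max(-127.0f, q));    // saturate
    return static_cast<int8_t>(std::nearbyint(q)); // round to nearest
}

static float dequantize_int8(int8_t q, float amax) {
    return static_cast<float>(q) * (amax / 127.0f);
}

int main() {
    float amax = 8.0f; // hypothetical activation amax
    int8_t q = quantize_int8(3.3f, amax);
    printf("q=%d back=%f\n", q, dequantize_int8(q, amax)); // q=52, back ~3.276
    return 0;
}
```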
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu deleted file mode 100644 index 3aa0cd8668cf3c35ae8befde4ec3afd7f3e73e22..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu +++ /dev/null @@ -1,342 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormInt8Plugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template <int THREAD_DATA_LEN> -__global__ void IxinferResidualI8O(const float *input, int8_t *output, int hidden_size, float quant_scale) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size; - - input += block_start; - output += block_start; - - float4 *p_input = (float4 *)input; - char4 *p_output = (char4 *)output; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - vals[it].x = p_input[element_index].x; - vals[it].y = p_input[element_index].y; - vals[it].z = p_input[element_index].z; - vals[it].w = p_input[element_index].w; - - char4 res = float42char4(vals[it], quant_scale); - p_output[element_index] = res; - } -} - -template <typename T> -void IxinferResidualI8OLauncher(const T *input, int8_t *output, int batch_tokens, int hidden_size, float quant_scale, - cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferResidualI8O<1><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 2: - IxinferResidualI8O<2><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 3: - IxinferResidualI8O<3><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 4: - IxinferResidualI8O<4><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 5: - IxinferResidualI8O<5><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 6: - IxinferResidualI8O<6><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 7: - IxinferResidualI8O<7><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 8: - IxinferResidualI8O<8><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 9: - IxinferResidualI8O<9><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 10: - IxinferResidualI8O<10><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 11: - IxinferResidualI8O<11><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 12: - IxinferResidualI8O<12><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 13: - IxinferResidualI8O<13><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 14: - IxinferResidualI8O<14><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 15: - IxinferResidualI8O<15><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - case 16: - IxinferResidualI8O<16><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale); - break; - default: - throw std::runtime_error("IxinferResidualI8OLauncher"); - break; - } -} - -template <int THREAD_DATA_LEN> -__global__ void IxinferBertEmbedLnKernel(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, - float *output, int *pad_mask, int *type_ids, int pad_id, int batch_size, - int seq_len, int hidden_dim, const float *scale, const float *bias) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - int batch_idx, seq_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - output += block_start; - - float4 *p_output = (float4 *)output; - - float4 *p_scale = (float4 *)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_value = (float4 *)(token_emb + token * hidden_dim); - float4 *p_pemb = (float4 *)(pos_emb + seq_idx * hidden_dim); - float4 *p_temb = (float4 *)(type_emb + token_type * hidden_dim); - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - if (token == pad_id) { - if (element_index == 0) { - pad_mask[tokens_idx] = 1; - } - vals[it] = make_float4(0.f, 0.f, 0.f, 0.f); - - } else { - if (element_index == 0) { - pad_mask[tokens_idx] = 0; - } - - vals[it].x = p_value[element_index].x + p_pemb[element_index].x + p_temb[element_index].x; - vals[it].y = p_value[element_index].y + p_pemb[element_index].y + p_temb[element_index].y; - vals[it].z = p_value[element_index].z + p_pemb[element_index].z + p_temb[element_index].z; - vals[it].w = p_value[element_index].w + p_pemb[element_index].w + p_temb[element_index].w; - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - } - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 scale_value = p_scale[element_index]; - float4 bias_value = p_bias[element_index]; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_dim, epsilon, - scale_value, bias_value); - int tokens_idx = blockIdx.x; - - int token = tokens[tokens_idx]; - if (token == pad_id) { - p_output[element_index] = make_float4(0.f, 0.f, 0.f, 0.f); - } else { - p_output[element_index] = norm_value; - } - } -} - - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream) { - if (hidden_size > 4096) { - throw
std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t
const wordSize, - int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float l0_qkv_in_amax) -{ - IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, buffer, maskIdx, (int*)segmentIds, - padId, B, S, E, gamma, beta, stream); - - IxinferResidualI8OLauncher(buffer, output, B*S, E, 127.0 / l0_qkv_in_amax, stream); - return cudaSuccess; -} - -void __global__ IxinferMaskPadKernel(const int32_t* mask, int8_t* new_mask, int bsz, - int ori_seq_len, int hsz, int fmha_seq_len) { - int batch_idx = blockIdx.x; - int seq_idx = blockIdx.y; - - if (seq_idx < ori_seq_len) { - if (threadIdx.x == 0) { - new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx]; - } - } else { - new_mask[batch_idx * fmha_seq_len + seq_idx] = 1; - } -} - -void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream) { - if (hsz / 2 > 4096) { - throw std::runtime_error("hsz/2>4096"); - } - if (hsz % 2 != 0) { - throw std::runtime_error("hsz % 2 !=0"); - } - if (ori_seq_len > fmha_seq_len) { - throw std::runtime_error("ori_seq_len > fmha_seq_len"); - } - if (bsz * ori_seq_len > batch_tokens) { - throw std::runtime_error("bsz*ori_seq_len > batch_tokens"); - } - dim3 blockSize(bsz, fmha_seq_len); - IxinferMaskPadKernel<<>>(mask, new_mask, bsz, ori_seq_len, hsz, - fmha_seq_len); -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h deleted file mode 100644 index 5fee7a4326b6ce9dbd45a1f868507956db8e450f..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
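Both embedding kernels in this file accumulate the LayerNorm statistics with Welford's online algorithm via `WelfordCombine` / `WelfordWarpReduce` from `bert_helper.h`, which this diff does not show. A scalar C++ sketch of the combine step those helpers presumably implement; `welford_combine` is an illustrative name:

```cpp
#include <cstdio>

// Welford online update: fold one sample into (mean, m2, count).
// m2 accumulates the sum of squared deviations, so variance = m2 / count.
static void welford_combine(float x, float* mean, float* m2, float* count) {
    *count += 1.0f;
    float delta1 = x - *mean;
    *mean += delta1 / *count;
    float delta2 = x - *mean;
    *m2 += delta1 * delta2;
}

int main() {
    float mean = 0.f, m2 = 0.f, count = 0.f;
    const float xs[] = {1.f, 2.f, 3.f, 4.f};
    for (float x : xs) welford_combine(x, &mean, &m2, &count);
    // The kernels then normalize with rsqrtf(m2 / hidden_dim + epsilon).
    printf("mean=%f var=%f\n", mean, m2 / count); // mean=2.5 var=1.25
    return 0;
}
```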
-*/ -#pragma once -#include -#include -#include -#include -#include - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream); - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize, - int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float token_embed_amax_); - -void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -class EmbLayerNormInt8PluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormInt8PluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormInt8PluginDynamic() noexcept = delete; - ~EmbLayerNormInt8PluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - // cuda_unique_ptr mNewMask; 
- WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; - WeightsWithOwnership mTokEmb; - WeightsWithOwnership mPosEmb; -}; - -class EmbLayerNormInt8PluginDynamicCreator : public IPluginCreator { - public: - EmbLayerNormInt8PluginDynamicCreator(); - - ~EmbLayerNormInt8PluginDynamicCreator() override = default; - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - PluginFieldCollection const* getFieldNames() noexcept override; - - IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; - - IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; - -}; - - -} // namespace bert -} //namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp deleted file mode 100644 index 499b2eefc7c691caf0234bde372412d7e69d1aef..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp +++ /dev/null @@ -1,495 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
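Note that `REGISTER_TENSORRT_PLUGIN` is commented out for these creators, so registration presumably happens elsewhere in the library. For reference, a sketch of how a creator like this is typically registered and then looked up through the plugin registry; `makeEmbLayerNorm` is a hypothetical helper and the field values would be the network's real weights:

```cpp
#include <NvInferPlugin.h>
#include <vector>

using namespace nvinfer1;

// Typically done once at library load time:
// REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator);

IPluginV2* makeEmbLayerNorm(std::vector<PluginField> const& fields) {
    // Look the creator up by the name/version it reports
    // ("CustomEmbLayerNormPluginDynamic_IxRT", version "1").
    auto* creator = getPluginRegistry()->getPluginCreator(
        "CustomEmbLayerNormPluginDynamic_IxRT", "1");
    if (creator == nullptr) return nullptr;

    PluginFieldCollection fc{};
    fc.nbFields = static_cast<int32_t>(fields.size());
    fc.fields = fields.data();
    // createPlugin() walks fc exactly as in the creator code above.
    return creator->createPlugin("emb_layernorm", &fc);
}
```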
-*/ -#include "embLayerNormPlugin.h" - -#include "NvInferImpl.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" - -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_VERSION{"1"}; -char const* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormPluginDynamicCreator::mFC{}; -std::vector EmbLayerNormPluginDynamicCreator::mPluginAttributes; - -EmbLayerNormPluginDynamicCreator::EmbLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -PluginFieldCollection const* EmbLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormPluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl; - word_emb.values = fc->fields[i].data; - word_emb.count = fc->fields[i].length; - word_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl; - tok_emb.values = fc->fields[i].data; - tok_emb.count = fc->fields[i].length; - tok_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_position_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_position_embeddings..." << endl; - pos_emb.values = fc->fields[i].data; - pos_emb.count = fc->fields[i].length; - pos_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp16") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0; - gLogInfo << "Building output_fp16: " << output_fp16 << endl; - } - - if (field_name.compare("full_mask") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - useFullMask = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0; - gLogInfo << "Building full_mask: " << useFullMask << endl; - } - - if (field_name.compare("mha_type_id") == 0) { - mhaTypeId = *static_cast<int32_t const*>(fc->fields[i].data); - IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3); - gLogInfo << "Building mha typeId: " << mhaTypeId << endl; - } - - if (field_name.compare("pad_id") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - pad_id = *static_cast<int32_t const*>(fc->fields[i].data); - } - } - gLogInfo << "Building EmbLayerNormPluginDynamic Plugin..." << endl; - DataType mhaType = static_cast<DataType>(mhaTypeId); - EmbLayerNormPluginDynamic* p = - new EmbLayerNormPluginDynamic(name, output_fp16 ?
DataType::kHALF : DataType::kFLOAT, mhaType, beta, gamma, - word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator); - -//#########################################################################// -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, Weights const& gamma, - Weights const& wordEmb, Weights const& posEmb, - Weights const& tokEmb, bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormPluginDynamic deserialize." 
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast<char const*>(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormPluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -int32_t EmbLayerNormPluginDynamic::getNbOutputs() const noexcept { return 2; } - -int32_t EmbLayerNormPluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "EmbLayerNormPluginDynamic terminate." << endl; } - -size_t EmbLayerNormPluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(half) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast<char*>(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast<char*>(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast<char*>(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast<char*>(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormPluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormPluginDynamic destroy."
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0 || index == 1); - if (index == 0) { - IXRT_PLUGIN_ASSERT(mMhaType == DataType::kHALF || mMhaType == DataType::kFLOAT); - return mMhaType; - } - return DataType::kINT32; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormPluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormPluginDynamic clone." << endl; - - auto p = new EmbLayerNormPluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex == 0 || outputIndex == 1); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // embedded sequence - if (pos == 3) { - return desc.type == mMhaType && desc.format == prev.format; - } - // mask - return desc.type == ((mMhaType == DataType::kHALF) ? DataType::kINT32 : mMhaType); -} - -void EmbLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - gLogInfo << "EmbLayerNormPluginDynamic configurePlugin." 
<< endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2); - int32_t const S = inputs[0].desc.dims.d[SDIM]; - mSeqLen = S; - int32_t const B = inputs[0].desc.dims.d[BDIM]; - TRT_UNUSED B; - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[1].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[2].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]); - - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(mSeqLen == outputs[0].desc.dims.d[SDIM]) - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B); - IXRT_PLUGIN_ASSERT(static_cast(outputs[0].desc.dims.d[2]) == mHiddenSize); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1); - - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == mSeqLen); -} - -size_t EmbLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t EmbLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "enqueue EmbLayerNormPluginDynamic.." << endl; - try { - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t status = STATUS_SUCCESS; - int32_t fmha_S = S; - int32_t batch_tokens = B * fmha_S; - - // Our plugin outputs only one tensor - auto const inputIds = static_cast(inputs[0]); - auto const segmentIds = static_cast(inputs[1]); - - half const* beta = mBetaDev.get(); - half const* gamma = mGammaDev.get(); - if (mMhaType == DataType::kFLOAT) { - gLogError << "embLayerNormPlugin float type not supported!" << endl; - return STATUS_NOT_SUPPORTED; - } else if (mMhaType == DataType::kHALF) { - auto output = static_cast(outputs[0]); - auto mNewMask = static_cast(outputs[1]); - auto const wordEmb = static_cast(mWordEmbDev.get()); - auto const tokEmb = static_cast(mTokEmbDev.get()); - auto const posEmb = static_cast(mPosEmbDev.get()); - - status = - embLayerNorm(stream, static_cast(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma, - wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, output, mNewMask, mPadId); - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - } - else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " - << static_cast(mMhaType) << endl; - - return STATUS_NOT_SUPPORTED; - } - - return status; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu deleted file mode 100644 index 5766d382a6b3bda8cd315bb71916d568e7b380b7..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. 
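The deserializing constructor above stresses "Deserialize in the same order as serialization"; the `serialize_value` / `deserialize_value` helpers come from the repo's `serialize.h`, which this diff does not show. A sketch of the round-trip pattern under the assumption that they are plain memcpy-and-advance helpers for POD values:

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

// Assumed behavior of the serialize.h helpers: copy a POD value and
// advance the buffer/length cursors.
template <typename T>
void serialize_value(void** buffer, T const& value) {
    std::memcpy(*buffer, &value, sizeof(T));
    *buffer = static_cast<char*>(*buffer) + sizeof(T);
}

template <typename T>
void deserialize_value(void const** data, size_t* length, T* value) {
    assert(*length >= sizeof(T));
    std::memcpy(value, *data, sizeof(T));
    *data = static_cast<char const*>(*data) + sizeof(T);
    *length -= sizeof(T);
}

int main() {
    char buf[sizeof(size_t) * 2];
    void* w = buf;
    size_t hidden = 1024, seqLen = 384;
    serialize_value(&w, hidden); // write order...
    serialize_value(&w, seqLen);

    void const* r = buf;
    size_t length = sizeof(buf), h = 0, s = 0;
    deserialize_value(&r, &length, &h); // ...must match read order exactly
    deserialize_value(&r, &length, &s);
    assert(h == 1024 && s == 384);
    return 0;
}
```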
-* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormPlugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template -__global__ void IxinferBertEmbedLnKernel(const __half *token_emb, const __half *pos_emb, const __half *type_emb, - const int *tokens, __half *output, int *pad_mask, int *type_ids, int pad_id, - int batch_size, int seq_len, int hidden_dim, const __half *scale, - const __half *bias) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - output += block_start; - - __half2 *p_output = (__half2 *)output; - __half2 *p_scale = (__half2 *)scale; - __half2 *p_bias = (__half2 *)bias; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - int batch_idx, seq_idx, dim_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - dim_idx = element_index; - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - half2 value; - - if (token == pad_id) { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 1; - } - value.x = __float2half(0.f); - value.y = __float2half(0.f); - - } else { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 0; - } - value = ((half2 *)(token_emb + token * hidden_dim + dim_idx * 2))[0]; - half2 pemb = ((half2 *)(pos_emb + seq_idx * hidden_dim + dim_idx * 2))[0]; - half2 temb = ((half2 *)(type_emb + token_type * hidden_dim + dim_idx * 2))[0]; - - vals[it].x = __half2float(value.x) + __half2float(pemb.x) + __half2float(temb.x); - vals[it].y = __half2float(value.y) + __half2float(pemb.y) + __half2float(temb.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - m2 = rsqrtf(m2 / hidden_dim + epsilon); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - __half2 scale_value = p_scale[element_index]; - __half2 bias_value = p_bias[element_index]; - - float2 norm_value; - norm_value.x = (vals[it].x - mean) * m2 * __half2float(scale_value.x) + __half2float(bias_value.x); - norm_value.y = (vals[it].y - mean) * m2 * __half2float(scale_value.y) + __half2float(bias_value.y); - - __half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - int token = tokens[tokens_idx]; - if (token == pad_id) { - res.x = __float2half(0.f); - res.y = __float2half(0.f); - p_output[element_index] = res; - } else { - p_output[element_index] = res; - } - } - } -} - -void 
IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, - const int *tokens, half *output, int *pad_mask, int *type_ids, int pad_id, - int batch_size, int seq_len, int hidden_size, const half *scale, const half *bias, - cudaStream_t stream) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 2048"); - } - if (hidden_size / 2 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size / 2 // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - 
break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize, - int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId) -{ - IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, output, maskIdx, (int*)segmentIds, - padId, B, S, E, gamma, beta, stream); - return cudaSuccess; -} - -void __global__ IxinferMaskPadKernel(const int32_t* mask, int32_t* new_mask, int bsz, - int ori_seq_len, int hsz, int fmha_seq_len) { - int batch_idx = blockIdx.x; - int seq_idx = blockIdx.y; - - if (seq_idx < ori_seq_len) { - if (threadIdx.x == 0) { - new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx]; - } - } else { - new_mask[batch_idx * fmha_seq_len + seq_idx] = 1; - } -} - -void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream) { - if (hsz / 2 > 4096) { - throw std::runtime_error("hsz/2>4096"); - } - if (hsz % 2 != 0) { - throw std::runtime_error("hsz % 2 !=0"); - } - if (ori_seq_len > fmha_seq_len) { - throw std::runtime_error("ori_seq_len > fmha_seq_len"); - } - if (bsz * ori_seq_len > batch_tokens) { - throw std::runtime_error("bsz*ori_seq_len > batch_tokens"); - } - dim3 blockSize(bsz, fmha_seq_len); - IxinferMaskPadKernel<<>>(mask, new_mask, bsz, ori_seq_len, hsz, - fmha_seq_len); -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h deleted file mode 100644 index f96e7d7310613be1967072597317e45ee7dfbdb6..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
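`IxinferMaskPadKernel`, which appears in both the int8 and fp16 translation units, copies the computed pad mask into a buffer sized for the FMHA sequence length and marks every position beyond the original sequence as padding (1). A host-side reference of the same indexing, to make the layout explicit; `maskPadReference` is an illustrative name, not a function from this repo:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// CPU reference for the device-side mask padding: positions past
// ori_seq_len are forced to 1 (padding); the rest are copied through.
std::vector<int32_t> maskPadReference(std::vector<int32_t> const& mask,
                                      int bsz, int ori_seq_len, int fmha_seq_len) {
    assert(ori_seq_len <= fmha_seq_len);
    assert(static_cast<int>(mask.size()) == bsz * ori_seq_len);
    std::vector<int32_t> new_mask(bsz * fmha_seq_len, 1); // default: padded
    for (int b = 0; b < bsz; ++b)
        for (int s = 0; s < ori_seq_len; ++s)
            new_mask[b * fmha_seq_len + s] = mask[b * ori_seq_len + s];
    return new_mask;
}
```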
-*/ -#pragma once -#include -#include -#include -#include -#include - - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize, - int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId); - -void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, const int *tokens, half *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const half *scale, const half *bias, cudaStream_t stream); - -class EmbLayerNormPluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormPluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormPluginDynamic() noexcept = delete; - ~EmbLayerNormPluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; -
WeightsWithOwnership mTokEmb; - WeightsWithOwnership mPosEmb; -}; - -class EmbLayerNormPluginDynamicCreator : public IPluginCreator { - public: - EmbLayerNormPluginDynamicCreator(); - - ~EmbLayerNormPluginDynamicCreator() override = default; - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - PluginFieldCollection const* getFieldNames() noexcept override; - - IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; - - IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; - -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp deleted file mode 100644 index 30b47f88ae624db48d86d3d3f35327db82639012..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
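Functionally, the `embLayerNorm` path declared above fuses the three embedding lookups with LayerNorm and emits a pad mask in the same pass. A scalar host sketch of the per-token math, not project code; the epsilon value and the 1-marks-padding mask convention (inferred from the fill value in `IxinferMaskPad`) are assumptions:

```cpp
#include <cmath>

// Sketch: one token of fused embedding + LayerNorm.
void emb_layernorm_ref(const float* word_emb, const float* pos_emb, const float* type_emb,
                       const int* tokens, const int* type_ids, int pad_id, int pos,
                       int hidden, const float* gamma, const float* beta,
                       float* out, int* pad_mask) {
    const int tok = tokens[pos];
    pad_mask[pos] = (tok == pad_id) ? 1 : 0;  // assumed convention: 1 marks padding
    float mean = 0.f;
    for (int i = 0; i < hidden; ++i) {  // x = word + position + token-type embedding
        out[i] = word_emb[tok * hidden + i] + pos_emb[pos * hidden + i] +
                 type_emb[type_ids[pos] * hidden + i];
        mean += out[i];
    }
    mean /= hidden;
    float var = 0.f;
    for (int i = 0; i < hidden; ++i) var += (out[i] - mean) * (out[i] - mean);
    const float rstd = 1.f / sqrtf(var / hidden + 1e-12f);  // epsilon is an assumption
    for (int i = 0; i < hidden; ++i) out[i] = gamma[i] * (out[i] - mean) * rstd + beta[i];
}
```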
-*/ -#include "ffnPlugin.h" - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "gelu/geluPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFFN_VERSION{"1"}; -char const* const kFFN_NAME{"CustomFFNPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FFNPluginDynamicCreator::mFFN{}; -std::vector FFNPluginDynamicCreator::mPluginAttributes; - -FFNPluginDynamicCreator::FFNPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("act_type", nullptr, PluginFieldType::kINT32, 1)); - - mFFN.nbFields = mPluginAttributes.size(); - mFFN.fields = mPluginAttributes.data(); -} - -char const* FFNPluginDynamicCreator::getPluginName() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamicCreator::getPluginVersion() const noexcept { return kFFN_VERSION; } - -PluginFieldCollection const* FFNPluginDynamicCreator::getFieldNames() noexcept { return &mFFN; } - -IPluginV2* FFNPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FFNPluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - int32_t typeId = -1; - int32_t act_type = -1; - Weights W1{DataType::kFLOAT, nullptr, 0LL}; - Weights W2{DataType::kFLOAT, nullptr, 0LL}; - Weights B1{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W1", "W2", "B1"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("type_id") == 0) { - typeId = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (fieldName.compare("W1") == 0) { - gLogInfo << "Building W1..." << endl; - W1.values = fc->fields[i].data; - W1.count = fc->fields[i].length; - W1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W1 float32: " << (W1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("W2") == 0) { - gLogInfo << "Building W2..." << endl; - W2.values = fc->fields[i].data; - W2.count = fc->fields[i].length; - W2.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W2 float32: " << (W2.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("B1") == 0) { - gLogInfo << "Building B1..." << endl; - B1.values = fc->fields[i].data; - B1.count = fc->fields[i].length; - B1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is B1 float32: " << (B1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("act_type") == 0) { - gLogInfo << "Building act_type..." 
<< endl; - act_type = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building act_type: " << act_type << endl; - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (typeId < 0 || typeId > 1) { - gLogInfo << "Invalid type id" << typeId << endl; - } - if (W1.count == 0 || W1.values == nullptr) { - gLogInfo << "Invalid weights W1" << endl; - } - if (W2.count == 0 || W2.values == nullptr) { - gLogInfo << "Invalid weights W2" << endl; - } - if (B1.count == 0 || B1.values == nullptr) { - gLogInfo << "Invalid weights B1" << endl; - } - - DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF; - return new FFNPluginDynamic(name, type, outDims, act_type, W1, W2, B1); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FFNPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FFNPluginDynamic::destroy() - try { - return new FFNPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FFNPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FFNPluginDynamicCreator); -//#########################################################################// -FFNPluginDynamic::FFNPluginDynamic(std::string const name, DataType const type, int32_t const outDim, - int32_t const act_type, Weights const& W1, Weights const& W2, Weights const& B1) - : mLayerName(name), - mType(type), - mHiddenSize(outDim), - mActType(act_type), - mWdev1(nullptr), - mWdev2(nullptr), - mBdev1(nullptr) { - mW1.convertAndCopy(W1, mType); - mW2.convertAndCopy(W2, mType); - mB1.convertAndCopy(B1, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -FFNPluginDynamic::FFNPluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev1(nullptr), mWdev2(nullptr), mBdev1(nullptr) { - gLogInfo << "FFNPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mActType); - - char const* d = static_cast(data); - - mW1.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - - mW2.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - - mB1.convertAndCopy(d, mHiddenSize * 4, mType); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -// IPluginV2 Methods -char const* FFNPluginDynamic::getPluginType() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamic::getPluginVersion() const noexcept { return kFFN_VERSION; } - -int32_t FFNPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FFNPluginDynamic::initialize() noexcept { - gLogInfo << "FFNPluginDynamic initialize" << endl; - return 0; -} - -void FFNPluginDynamic::terminate() noexcept { gLogInfo << 
"FFNPluginDynamic terminate" << endl; } - -size_t FFNPluginDynamic::getSerializationSize() const noexcept { - size_t wordSize = getElementSize(mType); - return wordSize * (mHiddenSize * mHiddenSize * 8 + mHiddenSize * 4) + sizeof(mType) + sizeof(mHiddenSize) + - sizeof(mActType); -} - -void FFNPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mActType); - - size_t wordSize = getElementSize(mType); - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev1.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mWdev2.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mBdev1.get()), 4 * mHiddenSize * wordSize); -} - -void FFNPluginDynamic::destroy() noexcept { - gLogInfo << "FFNPluginDynamic destroy" << endl; - mWdev1.reset(nullptr); - mWdev2.reset(nullptr); - mBdev1.reset(nullptr); - delete this; -} - -void FFNPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FFNPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FFNPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FFNPluginDynamic clone" << endl; - - auto* p = new FFNPluginDynamic(mLayerName, mType, mHiddenSize, mActType, mW1, mW2, mB1); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FFNPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FFNPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FFNPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - 
IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FFNPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - int32_t const S = inputs[0].dims.d[SDIM]; - int32_t const B = inputs[0].dims.d[BDIM]; - return B * S * 4 * mHiddenSize * sizeof(half); -} - -int32_t FFNPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FFNPluginDynamic.." << endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight1 = static_cast(mWdev1.get()); - auto weight2 = static_cast(mWdev2.get()); - auto bias1 = static_cast(mBdev1.get()); - auto buffer = static_cast(workSpace); - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight1, input, bias1, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, mActType, - stream, cuinfer_handle); - cuinfer_gemm(weight2, buffer, nullptr, output, 1, mHiddenSize, n, 4 * mHiddenSize, 0, 0, 0, 1.0f, -1, - stream, cuinfer_handle); -#else - cublaslt_gemm(weight1, input, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, blaslt_handle, - stream); - computeGeluBias(buffer, buffer, bias1, 4 * mHiddenSize, n, stream); - cublaslt_gemm(weight2, buffer, output, 1, mHiddenSize, n, mHiddenSize * 4, 0, 0, 0, 1.0f, blaslt_handle, - stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h deleted file mode 100644 index 21459c9bfe7ed5c1a206e8dc6b920bf17228fc29..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
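The `enqueue` above is the whole feed-forward block: one GEMM up to width 4H, a bias + GELU (fused into `cuinfer_gemm` on Iluvatar, explicit `computeGeluBias` otherwise), and one GEMM back down to H, which is why `getWorkspaceSize` reserves B·S·4H half elements for the intermediate. A naive host reference of that computation (a sketch; row-major layout is an assumption, and the transpose conventions of the GEMM helpers are not modeled):

```cpp
#include <cmath>
#include <vector>

// out = W2 * gelu(W1 * x + b1).  x: [n, H], W1: [H, 4H], b1: [4H], W2: [4H, H].
void ffn_ref(const float* x, const float* W1, const float* b1, const float* W2,
             float* out, int n, int H) {
    std::vector<float> mid(static_cast<size_t>(n) * 4 * H);  // the 4H-wide workspace
    for (int r = 0; r < n; ++r)
        for (int c = 0; c < 4 * H; ++c) {
            float acc = b1[c];
            for (int k = 0; k < H; ++k) acc += x[r * H + k] * W1[k * 4 * H + c];
            float t = tanhf(0.7978845608f * (acc + 0.044715f * acc * acc * acc));
            mid[r * 4 * H + c] = 0.5f * acc * (1.f + t);  // tanh-GELU, as in geluPlugin
        }
    for (int r = 0; r < n; ++r)
        for (int c = 0; c < H; ++c) {
            float acc = 0.f;
            for (int k = 0; k < 4 * H; ++k) acc += mid[r * 4 * H + k] * W2[k * H + c];
            out[r * H + c] = acc;
        }
}
```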
-*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "backend/cublas/cublas_helper.h" -#include "bertCommon.h" -#include - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -class FFNPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - int32_t const out_type, nvinfer1::Weights const& W1, nvinfer1::Weights const& W2, - nvinfer1::Weights const& B1); - - FFNPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNPluginDynamic without arguments, so we - // delete default constructor. - FFNPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mHiddenSize; - size_t mActType; - - bert::WeightsWithOwnership mW1; - bert::WeightsWithOwnership mB1; - bert::WeightsWithOwnership mW2; - bert::cuda_unique_ptr mWdev1; - bert::cuda_unique_ptr mWdev2; - bert::cuda_unique_ptr mBdev1; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* 
pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFFN; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class FFNInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector const& scale); - - FFNInt8PluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNInt8PluginDynamic without arguments, so we - // delete default constructor. - FFNInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - int32_t mNmax; - int32_t mK; - int32_t mNumBias; - - vector mScale; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept 
override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp deleted file mode 100644 index b9ae517746d9ecee513e6834f150142562385c15..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp +++ /dev/null @@ -1,355 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
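The GELU plugin deleted below carries an int8 path driven by a dequantize scale on input and a quantize scale on output (its `enqueue` passes `inputDesc[0].scale` and `1.0/outputDesc[0].scale`). A host mirror of the conversion its kernels use, a sketch rather than project code: round to nearest via `floorf(x + 0.5f)` and clamp symmetrically to [-127, 127], so -128 is never produced:

```cpp
#include <cmath>
#include <cstdint>

int8_t quant_ref(float x, float quant_scale) {
    int32_t i = static_cast<int32_t>(floorf(x * quant_scale + 0.5f));
    i = i < -127 ? -127 : (i > 127 ? 127 : i);
    return static_cast<int8_t>(i);
}

float dequant_ref(int8_t q, float dequant_scale) { return q * dequant_scale; }

// One int8 GELU element then follows the kernels' shape:
//   y = quant_ref(gelu(dequant_ref(x, dequant_scale) + bias), quant_scale);
```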
-*/ -#include "geluPlugin.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kGELU_IXRT_PLUGIN_VERSION{"1"}; -char const* const kGELU_IXRT_PLUGIN_NAME{"CustomGeluPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection GeluPluginDynamicCreator::mFC{}; -std::vector GeluPluginDynamicCreator::mPluginAttributes; - -GeluPluginDynamicCreator::GeluPluginDynamicCreator() { - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1)); - - // Fill PluginFieldCollection with PluginField arguments metadata - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* GeluPluginDynamicCreator::getPluginName() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamicCreator::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -PluginFieldCollection const* GeluPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* GeluPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogVerbose << "GeluPluginDynamicCreator createPlugin\n"; - IXRT_PLUGIN_ASSERT(fc != nullptr); - - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - ixrt_plugin::validateRequiredAttributesExist({"type_id", "ld"}, fc); - int32_t ld = 0; - - for (int32_t i = 0; i < fc->nbFields; i++) { - IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr); - std::string fieldName(fc->fields[i].name); - - if (fieldName.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - } - if (fieldName.compare("bias") == 0) { - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - if (fieldName.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - } - } - - if (typeId < 0 || typeId > 3) { - gLogError << "GeluPluginDynamicCreator: invalid typeId " << typeId << std::endl; - return nullptr; - } - - return new GeluPluginDynamic(name, static_cast(typeId), bias, ld); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* GeluPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call GeluPluginDynamic::destroy() - try { - return new GeluPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void GeluPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(GeluPluginDynamicCreator); -//#########################################################################// -GeluPluginDynamic::GeluPluginDynamic(const std::string name, const DataType type, Weights const& bias, const int ld) - : mLayerName(name), mType(type), mLd(ld), mNumBias(bias.count) { - if (mNumBias > 0) 
{ - mBias.convertAndCopy(bias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -GeluPluginDynamic::GeluPluginDynamic(const std::string name, void const* data, size_t length) : mLayerName(name) { - gLogVerbose << "GeluPluginDynamic deserialize\n"; - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mLd); - deserialize_value(&data, &length, &mNumBias); - - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char const* d = static_cast(data); - mBias.convertAndCopy(d, mNumBias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -// IPluginV2 Methods - -char const* GeluPluginDynamic::getPluginType() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamic::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -int32_t GeluPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t GeluPluginDynamic::initialize() noexcept { - gLogVerbose << "GeluPluginDynamic initialize\n"; - return 0; -} - -void GeluPluginDynamic::terminate() noexcept { gLogVerbose << "GeluPluginDynamic terminate\n"; } - -size_t GeluPluginDynamic::getSerializationSize() const noexcept { - return sizeof(mType) + sizeof(mLd) + sizeof(mNumBias) + mNumBias * sizeof(half); -} - -void GeluPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mLd); - serialize_value(&buffer, mNumBias); - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char* d = static_cast(buffer); - - serFromDev(d, static_cast(mBiasDev.get()), mLd * getElementSize(DataType::kHALF)); - } -} - -void GeluPluginDynamic::destroy() noexcept { - gLogVerbose << "GeluPluginDynamic destroy\n"; - // This gets called when the network containing plugin is destroyed - if (mNumBias > 0) { - mBiasDev.reset(); - } - delete this; -} - -void GeluPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -nvinfer1::DataType GeluPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF || - inputTypes[0] == DataType::kINT8); - return inputTypes[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* GeluPluginDynamic::clone() const noexcept { - try { - gLogVerbose << "GeluPluginDynamic clone\n"; - auto* plugin = new GeluPluginDynamic(mLayerName, mType, mBias, mLd); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, - int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - return inputs[0]; - } catch (std::exception const&
e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool GeluPluginDynamic::supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(pos >= 0); - IXRT_PLUGIN_ASSERT(pos < nbInputs + nbOutputs); - } catch (std::exception const& e) { - caughtError(e); - return false; - } - - PluginTensorDesc const& input = inOut[0]; - if (pos == 0) { - return (input.type == mType) && (input.format == TensorFormat::kLINEAR); - } - if (pos == 1) { - PluginTensorDesc const& output = inOut[1]; - return (input.type == output.type) && (output.format == TensorFormat::kLINEAR) && (output.type == mType); - } - return false; -} - -void GeluPluginDynamic::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - gLogVerbose << "GeluPluginDynamic configurePlugin\n"; - - try { - IXRT_PLUGIN_ASSERT(in != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(mType == in[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == DataType::kHALF || mType == DataType::kINT8); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t GeluPluginDynamic::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -template -int32_t GeluPluginDynamic::enqueueTyped(void const* input_, void* output_, int32_t const inputVolume, - cudaStream_t stream) noexcept { - TDataType const* input = static_cast(input_); - TDataType* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - TDataType const* bias = static_cast(mBiasDev.get()); - return computeGeluBias(output, input, bias, rows, cols, stream); - } else { - return computeGelu(stream, inputVolume, input, output); - } -} - -int32_t GeluPluginDynamic::enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t const inputVolume, cudaStream_t stream) noexcept { - int8_t const* input = static_cast(input_); - int8_t* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - half const* bias = static_cast(mBiasDev.get()); - return computeGeluI8O8Bias(output, input, bias, rows, cols, dequant_scale, quant_scale, stream); - } else { - return computeGeluI8O8(stream, inputVolume, input, output, dequant_scale, quant_scale); - } -} - -int32_t GeluPluginDynamic::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, - nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputDesc != nullptr); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } - - int32_t const inputVolume = volume(inputDesc[0].dims); - int32_t batch_token_num = inputDesc[0].dims.d[BDIM] * inputDesc[0].dims.d[SDIM]; - - // Our plugin outputs only one tensor. - // Launch CUDA kernel wrapper and save its return value. 
- switch (mType) { - case DataType::kFLOAT: - return enqueueTyped(inputs[0], outputs[0], inputVolume, stream); - case DataType::kHALF: - return enqueueTyped(inputs[0], outputs[0], inputVolume, stream); - case DataType::kINT8: { - int8_t* input = (int8_t*)(inputs[0]); - int8_t* output = (int8_t*)(outputs[0]); - IxinferBiasGeluI8II8O(batch_token_num, stream, (int8_t*)input, (int8_t*)output, - static_cast(mBiasDev.get()), mLd, inputDesc[0].scale, - 1.0/outputDesc[0].scale); - return STATUS_SUCCESS; - } - default: - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu deleted file mode 100644 index c36cac157bd49795d06c7bbb1f16bcf4b0b5cc8d..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
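In the geluPlugin.cu deleted below, the `__half` specialization of `IxinferBiasGeluI8II8OKernel` moves four int8 activations per thread as a single `char4` and their biases as two `half2`, trading four 8-bit memory transactions for one 32-bit one. A minimal sketch of that access pattern (4-byte alignment of the pointers is assumed, as in the original kernel):

```cpp
#include <cuda_fp16.h>
#include <cstdint>

// Sketch of the vectorized load used by the __half specialization below.
__device__ void load4(const int8_t* in, const __half* bias, int idx,
                      float4& x, float4& b) {
    char4 v = *reinterpret_cast<const char4*>(in + idx);          // 4 int8 in one load
    const __half2* p = reinterpret_cast<const __half2*>(bias + idx);
    x = make_float4(v.x, v.y, v.z, v.w);
    b = make_float4(__half2float(p[0].x), __half2float(p[0].y),   // 2 half2 = 4 biases
                    __half2float(p[1].x), __half2float(p[1].y));
}
```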
-*/ -#include "backend/bert/bert_helper.h" -#include "geluPlugin.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { -// constants for approximating the normal cdf -constexpr float A = 0.5f; -constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) -constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - - -template -__global__ void IxinferBiasGeluI8II8OKernel(int8_t *input, int8_t *output, const T *bias, int feature_dim, - float dequant_scale, float quant_scale) { - int block_start = blockIdx.x * feature_dim; - int start = block_start + threadIdx.x; - int end = block_start + feature_dim; - for (int i = start; i < end; i += blockDim.x) { - int input_index = i; - - float fout = gelu(float(input[input_index]) * dequant_scale + __ldg(&bias[i - block_start])); - - int output_index = i; - output[output_index] = float2int8(fout, quant_scale); - } -} - -template <> -__global__ void IxinferBiasGeluI8II8OKernel<__half>(int8_t *input, int8_t *output, const __half *bias, int feature_dim, - float dequant_scale, float quant_scale) { - // #pragma unroll - for (int block_index = 0; block_index < 2; block_index++) { - int block_start = (blockIdx.x * 2 + block_index) * feature_dim; - int start = block_start + threadIdx.x * 4; - int input_index = start; - char4 *p_input = (char4 *)(input + input_index); - half2 *p_bias = (half2 *)(bias + input_index - block_start); - float fout1 = gelu(float(p_input[0].x) * dequant_scale + __half2float(p_bias[0].x)); - float fout2 = gelu(float(p_input[0].y) * dequant_scale + __half2float(p_bias[0].y)); - float fout3 = gelu(float(p_input[0].z) * dequant_scale + __half2float(p_bias[1].x)); - float fout4 = gelu(float(p_input[0].w) * dequant_scale + __half2float(p_bias[1].y)); - - int output_index = start; - char4 out; - out.x = float2int8(fout1, quant_scale); - out.y = float2int8(fout2, quant_scale); - out.z = float2int8(fout3, quant_scale); - out.w = float2int8(fout4, quant_scale); - char4 *p_output = (char4 *)(output + output_index); - - p_output[0] = out; - } -} - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale) { - IxinferBiasGeluI8II8OKernel - <<>>(input, output, bias, feature_dim, dequant_scale, quant_scale); -} - -template void IxinferBiasGeluI8II8O(int, cudaStream_t, int8_t*, int8_t *, const half *, int, float, float); - -template -__global__ void geluKernel(const half a, const half b, const half c, int n, const half* input, half* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const half in = input[idx]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const float* input, float* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = input[idx]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const int8_t* input, int8_t* output, - float dequant_scale, float quant_scale) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = float(input[idx]) * dequant_scale; - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 
0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGelu(cudaStream_t stream, int n, const float* input, float* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int computeGelu(cudaStream_t stream, int n, const half* input, half* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output, dequant_scale, quant_scale); - - return 0; -} - -template -__global__ void geluBiasKernel(const half a, const half b, const half c, half* output, const half* input, - const half* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const half in = input[idx] + bias[it]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, float* output, const float* input, - const float* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = input[idx] + bias[it]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, int8_t* output, const int8_t* input, - const half* bias, float dequant_scale, float quant_scale, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = float(input[idx]) * dequant_scale + __half2float(bias[it]); - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 
127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, dequant_scale, quant_scale, ld); - return cudaPeekAtLastError(); -} - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h deleted file mode 100644 index 182fe7f36de0f3cb6bfefee49b4c04a596563003..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
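The constants `A`, `B`, `C` at the top of the deleted geluPlugin.cu encode the usual tanh approximation of GELU: since B = sqrt(2/pi) and C = 0.044715·B, the kernels' `x * (A + A * tanh(x * (C*x*x + B)))` is exactly `0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715*x^3)))`. A one-function reference sketch:

```cpp
#include <cmath>

float gelu_ref(float x) {
    const float A = 0.5f;
    const float B = 0.7978845608028654f;    // sqrt(2/pi)
    const float C = 0.035677408136300125f;  // 0.044715 * sqrt(2/pi)
    return x * (A + A * tanhf(x * (C * x * x + B)));  // matches geluKernel above
}
```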
-*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale); - -int32_t computeGelu(cudaStream_t stream, int32_t n, float const* input, float* output); - -int32_t computeGelu(cudaStream_t stream, int32_t n, half const* input, half* output); - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale); - -int32_t computeGeluBias(float* output, float const* input, float const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluBias(half* output, half const* input, half const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream); - -class GeluPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - GeluPluginDynamic(const std::string name, const nvinfer1::DataType type, nvinfer1::Weights const& bias, - const int ld); - - GeluPluginDynamic(const std::string name, void const* data, size_t length); - - // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete - // default constructor. - GeluPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - // Helper method for enqueue() - template - int32_t enqueueTyped(void const* input, void* output, int32_t const inputVolume, cudaStream_t stream) noexcept; - int32_t enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t 
const inputVolume, cudaStream_t stream) noexcept; - - const std::string mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasDev; - size_t mLd; - size_t mNumBias; -}; - -class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - GeluPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp deleted file mode 100644 index c3a25ba1b2a655ef7bd5bd708a5d8dc5289d32c6..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
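Every plugin in this diff pairs its plugin class with an `IPluginCreator` holding a static `PluginFieldCollection`. For context, a hedged sketch of how such a creator is used once registered, here instantiating the QKV int8 plugin defined just below; the field values are hypothetical, while the plugin name and version "3" come from that creator:

```cpp
#include <vector>
#include "NvInferRuntimeCommon.h"

nvinfer1::IPluginV2* make_qkv_int8(int32_t hidden, int32_t heads, const float dqProbs[3]) {
    using namespace nvinfer1;
    std::vector<PluginField> fields{
        {"hidden_size", &hidden, PluginFieldType::kINT32, 1},
        {"num_heads", &heads, PluginFieldType::kINT32, 1},
        {"dq_probs", dqProbs, PluginFieldType::kFLOAT32, 3},  // the three amax-derived scales
    };
    PluginFieldCollection fc{static_cast<int32_t>(fields.size()), fields.data()};
    auto* creator = getPluginRegistry()->getPluginCreator(
        "CustomQKVToContextPluginDynamic_IxRT", "3");
    return creator ? creator->createPlugin("qkv_int8", &fc) : nullptr;
}
```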
-*/ -#include "qkvToContextInt8Plugin.h" - -#include "NvInferRuntime.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION{"3"}; -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"}; -} // namespace - -PluginFieldCollection QKVToContextInt8PluginDynamicCreator::mFC{}; -std::vector QKVToContextInt8PluginDynamicCreator::mPluginAttributes; - -constexpr uint32_t IIDX = 0; // index of the input tensor -constexpr uint32_t MIDX = 1; // index of the mask -/* -dq_probs: -_arrange_qkv_amax -_softmax_in_amax -_softmax_out_amax -*/ -QKVToContextInt8PluginDynamicCreator::QKVToContextInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 3)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginName() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION; -} - -PluginFieldCollection const* QKVToContextInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* QKVToContextInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - int32_t hiddenSize = 0; - // Since numHeads must always exist or validateRequiredAttributes will fail, - // we can set numHeads to -1 so that static analysis tools don't warn about - // a division by zero in QKVToContextInt8PluginDynamic constructor. 
-        int32_t numHeads{-1};
-
-        vector<float> dqProbs;
-
-        ixrt_plugin::validateRequiredAttributesExist({"hidden_size", "num_heads"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string field_name(fc->fields[i].name);
-
-            if (field_name.compare("hidden_size") == 0) {
-                hiddenSize = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0,
-                                        ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str());
-                gLogInfo << "Building hiddenSize: " << hiddenSize << endl;
-            }
-            if (field_name.compare("num_heads") == 0) {
-                numHeads = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str());
-                gLogInfo << "Building numHeads: " << numHeads << endl;
-            }
-            if (field_name.compare("dq_probs") == 0) {
-                IXRT_PLUGIN_CHECK_VALUE(fc->fields[i].length > 0,
-                                        ("QKV: dqProbs cannot be empty, error: [dqProbs.length == 0]!"));
-                gLogInfo << "Building dqProbs: [";
-                for (auto j = 0; j < fc->fields[i].length; j++) {
-                    dqProbs.emplace_back(static_cast<float const*>(fc->fields[i].data)[j]);
-                    gLogInfo << std::setprecision(5) << dqProbs[j];
-                }
-                gLogInfo << "]" << endl;
-            }
-        }
-
-        QKVToContextInt8PluginDynamic* p = new QKVToContextInt8PluginDynamic(name, hiddenSize, numHeads, dqProbs);
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* QKVToContextInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                   size_t serialLength) noexcept {
-    try {
-        // This object will be deleted when the network is destroyed, which will
-        // call QKVToContextInt8PluginDynamic::destroy() noexcept
-        return new QKVToContextInt8PluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void QKVToContextInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* QKVToContextInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(QKVToContextInt8PluginDynamicCreator);
-//#########################################################################//
-QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize,
-                                                             int32_t const numHeads, vector<float> const dqProbs)
-    : mLayerName(name),
-      mS(0),
-      mB(0),
-      mHeadSize(hiddenSize / numHeads),
-      mHiddenSize(hiddenSize),
-      mNumHeads(numHeads),
-      mDqProbs(dqProbs) {}
-
-QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length)
-    : mLayerName(name) {
-    gLogInfo << "deserialize QKVToContextInt8PluginDynamic" << endl;
-    deserialize_value(&data, &length, &mNumHeads);
-    deserialize_value(&data, &length, &mHeadSize);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mDqProbs);
-}
-
-// IPluginV2 Methods
-char const* QKVToContextInt8PluginDynamic::getPluginType() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME;
-}
-
-char const* QKVToContextInt8PluginDynamic::getPluginVersion() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION;
-}
-
-int32_t QKVToContextInt8PluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t QKVToContextInt8PluginDynamic::initialize() noexcept { return 0; }
-
-void QKVToContextInt8PluginDynamic::terminate() noexcept {}
-
-size_t QKVToContextInt8PluginDynamic::getSerializationSize() const noexcept {
-    return
sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(mHiddenSize) + mDqProbs.size() * sizeof(float) + - sizeof(mDqProbs.size()); -} - -void QKVToContextInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mNumHeads); - serialize_value(&buffer, mHeadSize); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mDqProbs); -} - -void QKVToContextInt8PluginDynamic::destroy() noexcept { delete this; } - -void QKVToContextInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* QKVToContextInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType QKVToContextInt8PluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0) - return DataType::kINT8; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* QKVToContextInt8PluginDynamic::clone() const noexcept { - try { - QKVToContextInt8PluginDynamic* ret = - new QKVToContextInt8PluginDynamic(mLayerName, mHiddenSize, mNumHeads, mDqProbs); - - ret->setPluginNamespace(mNamespace.c_str()); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs QKVToContextInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - // input [B, S, 3*E] int8 - // pad_mask [B, S] int8 - - // output [B, S, E] int8 - IXRT_PLUGIN_ASSERT(outputIndex == 0); - // Copy over everything - DimsExprs output(inputs[IIDX]); - // Divide last dim by three - auto const* three = exprBuilder.constant(3); - output.d[HDIM] = exprBuilder.constant(mHiddenSize); - return output; -} -bool QKVToContextInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - return (inOut[pos].type == DataType::kINT8) && (inOut[pos].format == TensorFormat::kLINEAR); -} - -void QKVToContextInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, - DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - PluginTensorDesc const& inDesc = in[IIDX].desc; - PluginTensorDesc const& outDesc = out[0].desc; - IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5) - IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1); - - PluginTensorDesc const& maskDesc = in[MIDX].desc; - IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]); - - const int32_t S = inDesc.dims.d[SDIM]; - - IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == S); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1); - -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif -} - -size_t QKVToContextInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* 
outputs, - int32_t nbOutputs) const noexcept { - const int32_t B = inputs[0].dims.d[BDIM]; - const int32_t S = inputs->dims.d[SDIM]; - const int32_t E = inputs->dims.d[HDIM]; - IXRT_PLUGIN_ASSERT(E == 3 * mHiddenSize); - int64_t buffer_size = B * S * E * sizeof(int8_t) + B * S * S * mNumHeads * sizeof(int8_t); -#ifndef __ILUVATAR__ - buffer_size += B * S * S * mNumHeads * sizeof(int32_t); -#endif - return buffer_size; -} - -int32_t QKVToContextInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, 0)); -#endif - int32_t const B = inputDesc[0].dims.d[BDIM]; - int32_t const S = inputDesc[0].dims.d[SDIM]; - - float qkv_out_amax_ = inputDesc[0].scale * 127; - float linear_in_amax_ = outputDesc[0].scale * 127; - float arrange_qkv_amax_ = mDqProbs[0]; - float softmax_in_amax_ = mDqProbs[1]; - float softmax_out_amax_ = mDqProbs[2]; - - int8_t* qkv_buffer_ = (int8_t*)inputs[0]; - int8_t* qkv_out_ = (int8_t*)outputs[0]; - int8_t* mask_ = (int8_t*)inputs[1]; - - int64_t buffer_size = B * S * mHiddenSize; - int64_t buffer_size2 = B * S * S * mNumHeads; - int8_t* q_buffer_ = static_cast(workspace); - int8_t* k_buffer_ = q_buffer_ + buffer_size; - int8_t* v_buffer_ = k_buffer_ + buffer_size; - int8_t* qk_buffer_ = v_buffer_ + buffer_size; - -#ifdef __ILUVATAR__ - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, cuinfer_handle, stream); -#else - int32_t* qk_out_ = reinterpret_cast(qk_buffer_ + buffer_size2); - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, blaslt_handle, stream); -#endif - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu deleted file mode 100644 index 2330debf3e1bee647c70336b35729699b90ad06e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu +++ /dev/null @@ -1,488 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/
-#include "backend/bert/bert_helper.h"
-#include "backend/cublas/cublas_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#endif
-#include "qkvToContextInt8Plugin.h"
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-const int _max_thread_per_block = 1024;
-const float _quant_range = 127.0;
-
-__global__ void IxinferArrangeEncselfQkvI8II8ONoBias(const int8_t *ori_qkv, int8_t *new_qkv, int max_batch_dim,
-                                                     int batch_seq_len, int dim_per_head, int head_num) {
-    int hidden_size = dim_per_head * head_num;
-    int batch_id = blockIdx.x / batch_seq_len;
-    int token_id = blockIdx.x % batch_seq_len;
-
-    int i = threadIdx.x;  // each thread handles 4 elements
-
-    int head_id = (i * 4) / dim_per_head;
-    int dim_id = (i * 4) % dim_per_head;
-    int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num, batch_seq_len, dim_per_head);
-
-#pragma unroll
-    for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) {
-        char4 *p_ori_qkv = (char4 *)(ori_qkv + (blockIdx.x * 3 + qkv_idx) * hidden_size);
-        int qkv_offset = max_batch_dim * qkv_idx;
-        char4 *p_new_qkv = (char4 *)(new_qkv + qkv_offset + target_id);
-        p_new_qkv[0] = p_ori_qkv[i];
-    }
-}
-
-template <int log2_elements, int WARP_BATCH>
-__global__ void IxinferCorrelationSoftmaxEncselfI8II8OKernel(int8_t *correlation, const int8_t *src_padding_mask,
-                                                             int batch_seq_len, float quant_scale,
-                                                             float dequant_scale) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-    int local_idx = threadIdx.x;
-
-    for (int warp_idx = 0; warp_idx < WARP_BATCH; ++warp_idx) {
-        int start_idx = (blockIdx.x * gridDim.y * WARP_BATCH * gridDim.z * batch_seq_len +
-                         (blockIdx.y + gridDim.y * warp_idx) * gridDim.z * batch_seq_len + blockIdx.z * batch_seq_len);
-
-        char4 *p_correlation = (char4 *)(correlation + start_idx);
-        char4 *p_src_padding_mask = (char4 *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-        // load data from global memory
-        // float
-        float4 elements[WARP_ITERATIONS];
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            int element_index = local_idx + it * SOFT_WARP_SIZE;
-            if (element_index < batch_seq_len / 4) {
-                char4 mask = p_src_padding_mask[element_index];
-                char4 correlation_value = p_correlation[element_index];
-
-                elements[it].x = mask.x ? -INFINITY : (float)correlation_value.x * dequant_scale;
-                elements[it].y = mask.y ? -INFINITY : (float)correlation_value.y * dequant_scale;
-                elements[it].z = mask.z ? -INFINITY : (float)correlation_value.z * dequant_scale;
-                elements[it].w = mask.w ? -INFINITY : (float)correlation_value.w * dequant_scale;
-            } else {
-                elements[it].x = -INFINITY;
-                elements[it].y = -INFINITY;
-                elements[it].z = -INFINITY;
-                elements[it].w = -INFINITY;
-            }
-        }
-
-        // compute max_value
-        float max_value = elements[0].x;
-        max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-        max_value = (max_value > elements[0].z) ? max_value : elements[0].z;
-        max_value = (max_value > elements[0].w) ? max_value : elements[0].w;
-
-#pragma unroll
-        for (int it = 1; it < WARP_ITERATIONS; ++it) {
-            max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-            max_value = (max_value > elements[it].y) ? max_value : elements[it].y;
-            max_value = (max_value > elements[it].z) ? max_value : elements[it].z;
-            max_value = (max_value > elements[it].w) ?
max_value : elements[it].w; - } - - warp_reduce(&max_value); - - // exp sum - float sum = 0.0f; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[it].x = __expf(elements[it].x - max_value); - elements[it].y = __expf(elements[it].y - max_value); - elements[it].z = __expf(elements[it].z - max_value); - elements[it].w = __expf(elements[it].w - max_value); - - sum += (elements[it].x + elements[it].y + elements[it].z + elements[it].w); - } - - warp_reduce(&sum); - sum = 1.0f / sum; - // store result -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - char4 correlation_value; - if (element_index < batch_seq_len / 4) { - correlation_value.x = float2int8(elements[it].x * sum, quant_scale); - correlation_value.y = float2int8(elements[it].y * sum, quant_scale); - correlation_value.z = float2int8(elements[it].z * sum, quant_scale); - correlation_value.w = float2int8(elements[it].w * sum, quant_scale); - - p_correlation[element_index] = correlation_value; - - } else { - break; - } - } - } -} - -void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - int8_t *correlation, const int8_t *src_padding_mask, float quant_scale, - float dequant_scale) { - const int NUM_INT8_SOFTMAX_BATCH_WARP = 4; - if (batch_seq_len > 512) { - throw std::runtime_error("batch_seq_len should <= 512"); - } - if (head_num % NUM_INT8_SOFTMAX_BATCH_WARP != 0) { - throw std::runtime_error("head_num % NUM_INT8_SOFTMAX_BATCH_WARP !0"); - } - if (batch_seq_len % 4 != 0) { - throw std::runtime_error("batch_seq_len % 4 != 0"); - } - - int log2_elements = log2_ceil(batch_seq_len / 4); - int next_power_of_two = 1 << log2_elements; - int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; - // dim3 blockSize(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, - // batch_seq_len); - // - dim3 grid(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, batch_seq_len); - - dim3 block(SOFT_WARP_SIZE); - - switch (log2_elements) { - case 0: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<0, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - - break; - - case 1: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<1, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 2: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<2, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 3: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<3, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 4: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<4, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 5: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<5, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 6: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<6, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 7: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<7, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 8: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<8, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 9: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<9, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - default: - throw std::runtime_error( - "ker_correlation_softmax_encself_i8I_i8O_ix_ " - "NotImplementedError"); - break; - } -} - - -__global__ void IxinferArrangeAttenOutputI8II8OKernel(const int8_t *ori_q, int8_t *new_q, int beam_size, - int dim_per_head, int head_num, float quant_scale, - float dequant_scale) { - int hidden_size = dim_per_head * head_num; - -#pragma unroll - for (int blockin = 0; blockin < 4; blockin++) { - int batch_id = (blockIdx.x * 4 + blockin) / beam_size; - // note, for encoder, beam_id is token_id; for decoder, beam_id is beam_id - int beam_id = (blockIdx.x * 4 + blockin) % beam_size; - int i = threadIdx.x; - int out_index = (blockIdx.x * 4 + blockin) * hidden_size + i; - int head_id = i / dim_per_head; - int dim_id = i % dim_per_head; - - char4 *p_ori_q = (char4 *)ori_q; - char4 *p_new_q = (char4 *)new_q; - char4 value; - - value = p_ori_q[targetid_4dim(batch_id, head_id, beam_id, dim_id, head_num, beam_size, dim_per_head)]; - value.x = float2int8(value.x * dequant_scale, quant_scale); - value.y = float2int8(value.y * dequant_scale, quant_scale); - value.z = float2int8(value.z * dequant_scale, quant_scale); - value.w = float2int8(value.w * dequant_scale, quant_scale); - p_new_q[out_index] = value; - } -} - -void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q, - int8_t *new_q, int beam_size, int dim_per_head, int head_num, - int max_thread_per_block, 
float quant_scale, float dequant_scale) { - int qual_hidden_size = hidden_size >> 2; - int qual_dim_per_head = dim_per_head >> 2; - IxinferArrangeAttenOutputI8II8OKernel<<>>( - ori_q, new_q, beam_size, qual_dim_per_head, head_num, quant_scale, dequant_scale); -} - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle, - cudaStream_t& stream) { - int batch_token_num = batch_size * batch_seq_len; - int max_batch_dim = batch_token_num * hidden_size; - - float scaleCtx = linear_in_amax / _quant_range; - float scaleArrange = arrange_qkv_amax / _quant_range; - float scaleSoftin = softmax_in_amax / _quant_range; - float scaleSoftout = softmax_out_amax / _quant_range; - - float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim); - float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx; - - IxinferArrangeEncselfQkvI8II8ONoBias<<>>( - qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num); - - switch (head_dim) { - case 64: - case 128: - case 192: - case 256: { - cuinferFlashAttnConfigInfo flashAttnInfo; - flashAttnInfo.scaling = sqrt(1.f / (head_dim * 1.0)); - flashAttnInfo.quantParam.q_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.k_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.v_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.p_amax = softmax_out_amax; - flashAttnInfo.quantParam.o_amax = linear_in_amax; - - cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc; - CUINFER_CHECK(cuinferCreateTensorDescriptor(&qDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&kDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&vDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&maskDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&oDesc)); - - CUINFER_CHECK(cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, 1, 1, batch_seq_len)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - - CUINFER_CHECK(cuinferFMHAForwardEx(cuinfer_handle, flashAttnInfo, qDesc, q_buffer, kDesc, k_buffer, vDesc, - v_buffer, maskDesc, mask, oDesc, qk_buffer)); - break; - } - default: { - cuinfer_i8_gemm(k_buffer, q_buffer, nullptr, qkv_buffer, batch_size * head_num, batch_seq_len, - batch_seq_len, head_dim, batch_seq_len * head_dim, batch_seq_len * head_dim, - batch_seq_len * batch_seq_len, scaleBmm1, 0.0, 0, cuinfer_handle, stream); - - IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qkv_buffer, mask, - 1.0 / scaleSoftout, scaleSoftin); - - cuinfer_nn_i8_gemm(v_buffer, qkv_buffer, qk_buffer, batch_size * 
head_num, head_dim, batch_seq_len, - batch_seq_len, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, - batch_seq_len * head_dim, scaleBmm2, cuinfer_handle, stream); - break; - } - } - - IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, qk_buffer, qkv_out, batch_seq_len, head_dim, - head_num, _max_thread_per_block, 1.f, 1.f); - return cudaSuccess; -} -#else -template -__global__ void quant_qkv_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_id = blockIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z; - int block_start = block_id * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale); - - p_output[element_index] = q_input; - } -} - -void quantQKVGemm(int32_t* input, int8_t* output, int batch_size, int head_num, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - int num_per_tca = min(hidden_size / 4, C10_WARP_SIZE); - dim3 gridSize(batch_size, head_num, batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - switch (num_warp) { - case 1: - quant_qkv_gemm<1> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 2: - quant_qkv_gemm<2> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 3: - quant_qkv_gemm<3> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 4: - quant_qkv_gemm<4> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 5: - quant_qkv_gemm<5> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 6: - quant_qkv_gemm<6> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 7: - quant_qkv_gemm<7> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 8: - quant_qkv_gemm<8> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 9: - quant_qkv_gemm<9> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 10: - quant_qkv_gemm<10> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 11: - quant_qkv_gemm<11> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 12: - quant_qkv_gemm<12> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 13: - quant_qkv_gemm<13> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 14: - quant_qkv_gemm<14> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 15: - quant_qkv_gemm<15> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 16: - quant_qkv_gemm<16> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - default: - throw std::runtime_error("quantQKVGemm"); - break; - } -} - - -cudaError_t fused_multihead_attetion_int8(int8_t *qkv_buffer, 
int8_t *mask, int8_t *q_buffer, int8_t *k_buffer, - int8_t *v_buffer, int32_t *qk_out, int8_t *qkv_out, int8_t *qk_buffer, int batch_size, - int batch_seq_len, int head_dim, int head_num, int hidden_size, - float arrange_qkv_amax, float softmax_in_amax, float softmax_out_amax, - float linear_in_amax, cublasLtHandle_t &cublas_lt_handle, - cudaStream_t &stream) { - int batch_token_num = batch_size * batch_seq_len; - int max_batch_dim = batch_token_num * hidden_size; - - float scaleCtx = linear_in_amax / _quant_range; - float scaleArrange = arrange_qkv_amax / _quant_range; - float scaleSoftin = softmax_in_amax / _quant_range; - float scaleSoftout = softmax_out_amax / _quant_range; - - float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim); - float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx; - - IxinferArrangeEncselfQkvI8II8ONoBias<<>>( - qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num); - - cublaslt_gemm(k_buffer, q_buffer, qk_out, batch_size * head_num, batch_seq_len, batch_seq_len, head_dim, - batch_seq_len * head_dim, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, 1, - cublas_lt_handle, stream); - quantQKVGemm(qk_out, qk_buffer, batch_size, head_num, batch_seq_len, batch_seq_len, scaleBmm1, stream); - - IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qk_buffer, mask, - 1.0 / scaleSoftout, scaleSoftin); - - cublaslt_gemm_nn(v_buffer, qk_buffer, qk_out, batch_size * head_num, head_dim, batch_seq_len, batch_seq_len, - batch_seq_len * head_dim, batch_seq_len * batch_seq_len, batch_seq_len * head_dim, 1, - cublas_lt_handle, stream); - quantQKVGemm(qk_out, q_buffer, batch_size, head_num, batch_seq_len, head_dim, scaleBmm2, stream); - - IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, q_buffer, qkv_out, batch_seq_len, head_dim, - head_num, _max_thread_per_block, 1.f, 1.f); - return cudaSuccess; -} -#endif -} // namespace bert -} // namespace nvinfer1::ixrt_plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h deleted file mode 100644 index b5c501fc35e06d259c62391dbaa43f7c3473481e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#include -#include "NvInferRuntime.h" -#include "bertCommon.h" -#include -#include -#ifdef __ILUVATAR__ -#include "ixinfer.h" -#endif - -namespace nvinfer1::ixrt_plugin -{ -namespace bert -{ - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle, - cudaStream_t& stream); -#else -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int32_t* qk_out, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, - cublasLtHandle_t& cublas_lt_handle, cudaStream_t& stream); -#endif - -void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - int8_t *correlation, const int8_t *src_padding_mask, float quant_scale, - float dequant_scale); - -void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q, - int8_t *new_q, int beam_size, int dim_per_head, int head_num, - int max_thread_per_block, float quant_scale, float dequant_scale); -class QKVToContextInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt -{ -public: - QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, int32_t const numHeads, - vector const dqProbs); - - QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make QKVToContextInt8PluginDynamic without arguments, so we - // delete default constructor. 
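The int8 entry points declared above carry four calibration ranges (`arrange_qkv_amax`, `softmax_in_amax`, `softmax_out_amax`, `linear_in_amax`); each amax maps the int8 range [-127, 127] onto a real-valued range, and the two batched GEMMs fold the surrounding quantize/dequantize steps into a single output scale each. A minimal standalone sketch of that bookkeeping, mirroring the `scaleBmm1`/`scaleBmm2` arithmetic in `qkvToContextInt8Plugin.cu` (the struct and function names here are illustrative, not part of the plugin):

```cpp
#include <cmath>

struct QkvGemmScales {
    float bmm1;  // applied to the Q*K^T GEMM output
    float bmm2;  // applied to the P*V GEMM output
};

QkvGemmScales deriveQkvGemmScales(float arrangeQkvAmax, float softmaxInAmax,
                                  float softmaxOutAmax, float linearInAmax, int headDim)
{
    const float kQuantRange = 127.0f;
    const float sArrange = arrangeQkvAmax / kQuantRange;  // int8 -> real for Q, K, V
    const float sSoftIn  = softmaxInAmax / kQuantRange;   // real -> int8 softmax input
    const float sSoftOut = softmaxOutAmax / kQuantRange;  // int8 -> real softmax output
    const float sCtx     = linearInAmax / kQuantRange;    // real -> int8 context output
    QkvGemmScales s;
    // Dequantize Q and K, requantize for the softmax input, fold in 1/sqrt(d).
    s.bmm1 = sArrange * sArrange / sSoftIn * std::sqrt(1.0f / headDim);
    // Dequantize P and V, requantize for the following linear layer.
    s.bmm2 = sSoftOut * sArrange / sCtx;
    return s;
}
```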
- QKVToContextInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - -protected: - void createMHARunner() noexcept; - int32_t getSMVersion() const noexcept; - -private: - std::string const& mLayerName; - std::string mNamespace; - - int32_t mS; - int32_t mB; - int32_t mSM; - int32_t mHeadSize; - int32_t mHiddenSize; - int32_t mNumHeads; - - cuda_unique_ptr mQkvBias; - - vector mDqProbs; - bool mUseInt8ScaleMax{true}; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif -}; - -class QKVToContextInt8PluginDynamicCreator : public nvinfer1::IPluginCreator -{ -public: - QKVToContextInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp deleted file mode 100644 index a69fb957ceb7a2d6bb7d4e5edc46fbe9fc8ca63c..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp +++ /dev/null @@ -1,388 +0,0 @@ -/* 
Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#include "qkvToContextPlugin.h" - -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "cuda_runtime_api.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" -#include -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION{"1"}; -char const* const kQKV_TO_CONTEXT_VAR_SEQLEN_IXRT_PLUGIN_VERSION{"2"}; -char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection QKVToContextPluginDynamicCreator::mFC{}; -std::vector QKVToContextPluginDynamicCreator::mPluginAttributes; - -constexpr uint32_t IIDX = 0; // index of the input tensor -constexpr uint32_t MIDX = 1; // index of the mask - -QKVToContextPluginDynamicCreator::QKVToContextPluginDynamicCreator() { - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("has_mask", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* QKVToContextPluginDynamicCreator::getPluginName() const noexcept { - return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextPluginDynamicCreator::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; -} - -PluginFieldCollection const* QKVToContextPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* QKVToContextPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating QKV2ContextPlugin..." 
<< endl; - IXRT_PLUGIN_ASSERT(fc != nullptr); - int32_t hiddenSize = 0; - // Since numHeads must always exist or validateRequiredAttributes will fail, - // we can set numHeads to -1 so that static analysis tools don't warn about - // a division by zero in QKVToContextPluginDynamic constructor. - int32_t numHeads{-1}; - bool hasMask = false; - int32_t typeId = -1; - - float dqProbs = -1; - - IXRT_PLUGIN_ASSERT(fc->fields != nullptr); - ixrt_plugin::validateRequiredAttributesExist({"type_id", "hidden_size", "num_heads", "has_mask"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr); - IXRT_PLUGIN_ASSERT(fc->fields[i].data != nullptr); - std::string field_name(fc->fields[i].name); - - if (field_name.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 2, - ("QKV: Invalid TypeId " + std::to_string(typeId)).c_str()); - gLogInfo << "Building typeId: " << typeId << endl; - } - if (field_name.compare("hidden_size") == 0) { - hiddenSize = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0, - ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str()); - gLogInfo << "Building hiddenSize: " << hiddenSize << endl; - } - if (field_name.compare("num_heads") == 0) { - numHeads = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str()); - gLogInfo << "Building numHeads: " << numHeads << endl; - } - if (field_name.compare("has_mask") == 0) { - auto hasMaskValue = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hasMaskValue == 0 || hasMaskValue == 1, - ("QKV: Invalid hasMask " + std::to_string(hasMaskValue)).c_str()); - hasMask = static_cast(hasMaskValue); - gLogInfo << "Building hasMask: " << hasMask << endl; - } - } - - gLogInfo << "Building the Plugin..." 
<< endl; - auto type = static_cast(typeId); - auto* p = new QKVToContextPluginDynamic(name, type, hiddenSize, numHeads, dqProbs, hasMask); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* QKVToContextPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call QKVToContextPluginDynamic::destroy() - return new QKVToContextPluginDynamic(name, serialData, serialLength); -} - -void QKVToContextPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* QKVToContextPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(QKVToContextPluginDynamicCreator); -//#########################################################################// -QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const DataType type, - const int32_t hiddenSize, const int32_t numHeads, - float const dqProbs, bool hasImask) - : mLayerName(name), - mS(0), - mB(0), - mHeadSize(hiddenSize / numHeads), - mHiddenSize(hiddenSize), - mNumHeads(numHeads), - mHasImask(hasImask), - mType(type) - -{ - // -} - -QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, void const* data, size_t length) - : mLayerName(name) { - gLogInfo << "QKV Deser Start" << endl; - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mNumHeads); - deserialize_value(&data, &length, &mHeadSize); - deserialize_value(&data, &length, &mHasImask); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mS); - deserialize_value(&data, &length, &mB); - - gLogInfo << "QKV Deser done" << endl; -} - -// IPluginV2 Methods -char const* QKVToContextPluginDynamic::getPluginType() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; } - -char const* QKVToContextPluginDynamic::getPluginVersion() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; } - -int32_t QKVToContextPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t QKVToContextPluginDynamic::initialize() noexcept { return 0; } - -void QKVToContextPluginDynamic::terminate() noexcept {} - -size_t QKVToContextPluginDynamic::getSerializationSize() const noexcept { - return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mHasImask) + sizeof(mHiddenSize) + - sizeof(mS) + sizeof(mB); -} - -void QKVToContextPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mNumHeads); - serialize_value(&buffer, mHeadSize); - serialize_value(&buffer, mHasImask); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mS); - serialize_value(&buffer, mB); -} - -void QKVToContextPluginDynamic::destroy() noexcept { delete this; } - -void QKVToContextPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* QKVToContextPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType QKVToContextPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t /*nbInputs*/) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF || - inputTypes[0] == DataType::kINT8); - 
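The `serialize()`, `getSerializationSize()`, and deserializing-constructor trio above must agree exactly on field order and sizes, since the engine blob is read back positionally. A cut-down sketch of the `serialize_value`/`deserialize_value` helper pattern they rely on (the real helpers come from the plugin's `serialize.h` and also handle `std::vector` payloads; this simplified version covers trivially copyable fields only):

```cpp
#include <cstddef>
#include <cstring>

// Write a trivially copyable value and advance the buffer cursor.
template <typename T>
void serializeValue(void** buffer, T const& value)
{
    std::memcpy(*buffer, &value, sizeof(T));
    *buffer = static_cast<char*>(*buffer) + sizeof(T);
}

// Read a value back, advance the cursor, and shrink the remaining length.
template <typename T>
void deserializeValue(void const** data, size_t* length, T* value)
{
    std::memcpy(value, *data, sizeof(T));
    *data = static_cast<char const*>(*data) + sizeof(T);
    *length -= sizeof(T);
}
```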
return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* QKVToContextPluginDynamic::clone() const noexcept { - gLogInfo << "QKV Clone" << endl; - - QKVToContextPluginDynamic* ret = nullptr; - ret = new QKVToContextPluginDynamic(mLayerName, mType, mHiddenSize, mNumHeads, mDqProbs, mHasImask); - - ret->setPluginNamespace(mNamespace.c_str()); - gLogInfo << "QKV Clone done" << endl; - return ret; -} - -DimsExprs QKVToContextPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t /*nbInputs*/, IExprBuilder& exprBuilder) noexcept { - // Input is BxSx3*N*H, output should be BxSxN*H - IXRT_PLUGIN_ASSERT(outputIndex == 0); - // Copy over everything - DimsExprs output(inputs[IIDX]); - // Divide last dim by three - auto const* three = exprBuilder.constant(3); - output.d[HDIM] = exprBuilder.constant(mHiddenSize); - return output; -} -bool QKVToContextPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t /*nbOutputs*/) noexcept { - IXRT_PLUGIN_ASSERT(pos >= 0); - IXRT_PLUGIN_ASSERT(pos < 2 + mHasImask); - IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask); - auto const* in = inOut; - auto const* out = inOut + nbInputs; - - if (pos == 0) { - return (in->type == mType) && (in->format == TensorFormat::kLINEAR); - } - - // pos==1 - if ((mHasImask && pos == 1)) // pos 1 is the mask - { - auto const* inMask = &inOut[1]; - - // detect full mask and check that it was produced - return (inMask->type == DataType::kINT32) && // precision - (inMask->format == TensorFormat::kLINEAR); // format - } - - if (!mHasImask || pos == 2) // output pos - { - return (in->type == out->type) && (out->format == TensorFormat::kLINEAR); - } - - return false; -} -void QKVToContextPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, - DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - PluginTensorDesc const& inDesc = in[IIDX].desc; - TRT_UNUSED inDesc; - PluginTensorDesc const& outDesc = out->desc; - TRT_UNUSED outDesc; - IXRT_PLUGIN_ASSERT(mType == inDesc.type); - IXRT_PLUGIN_ASSERT(mType == outDesc.type); - IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5) - IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1); - if (mHasImask) { - PluginTensorDesc const& maskDesc = in[MIDX].desc; - TRT_UNUSED maskDesc; - IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]); - } - - const int32_t S = inDesc.dims.d[SDIM]; - const int32_t B = inDesc.dims.d[BDIM] <= 0 ? 
in->max.d[BDIM] : inDesc.dims.d[BDIM]; - mS = S; - mB = B; - - IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == mS); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif -} - -size_t QKVToContextPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - const int32_t B = inputs->dims.d[BDIM]; - const int32_t S = inputs->dims.d[SDIM]; - const int32_t E = inputs->dims.d[2]; - int32_t fmha_S = S; - int64_t buffer_size = B * fmha_S * E; -#ifndef __ILUVATAR__ - buffer_size += B * S * S * mNumHeads; -#endif - return 4 * buffer_size * sizeof(mType); -} - -inline void print_element(half* x, int num, string name) { - printf("%s: \n", name.c_str()); - half* out = (half*)malloc(num * sizeof(half)); - cudaMemcpy(out, x, num * sizeof(half), cudaMemcpyDeviceToHost); - for (auto i = 0; i < num; i++) { - printf("%f\n", __half2float(out[i])); - } - printf("\n"); -} - -int32_t QKVToContextPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "in QKVToContextPluginDynamic.." << endl; - int32_t S = inputDesc->dims.d[SDIM]; - int32_t B = inputDesc->dims.d[BDIM]; - int32_t status = STATUS_SUCCESS; -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - - try { - if (mType != DataType::kHALF) { - gLogError << "embLayerNormPlugin infer type{" << int(mType) << "} not supported!" << endl; - return STATUS_NOT_SUPPORTED; - } - half* qkv_buffer_ = (half*)inputs[0]; - half* qkv_out_ = (half*)outputs[0]; - // [B, fmha_S] - int32_t* mask_ = mHasImask ? 
(int32_t*)inputs[1] : nullptr;
-        int fmha_seq_len = S;
-
-        int64_t buffer_size = B * fmha_seq_len * mHiddenSize;
-        half* q_buffer_ = reinterpret_cast<half*>(workspace);
-        half* k_buffer_ = q_buffer_ + buffer_size;
-        half* v_buffer_ = k_buffer_ + buffer_size;
-
-
-        // [B, S, 3*E, 1, 1] [B, fmha_S]
-#ifdef __ILUVATAR__
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, cuinfer_handle, stream);
-#else
-        half* qk_out_ = v_buffer_ + buffer_size;
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, blaslt_handle, stream);
-#endif
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return STATUS_FAILURE;
-    }
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
deleted file mode 100644
index fb9455c6c2f1dfcdc3e75fec03c16eb4169ed2db..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
+++ /dev/null
@@ -1,317 +0,0 @@
-#include "qkvToContextPlugin.h"
-#include "backend/bert/bert_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#else
-#include "backend/cublas/cublas_helper.h"
-#endif
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-void __global__ IxinferArrangeEncQkvKernel(half *ori_qkv, half *new_q, half *new_k, half *new_v,
-                                           int head_dim, int head_num, int batch_seq_len, int fmha_seq_len) {
-    int hidden_size = head_dim * head_num;
-    int batch_id = blockIdx.x;
-    int token_id = blockIdx.y;
-
-    int i = threadIdx.x;  // each thread handles 2 elements
-    int head_id = (i * 2) / head_dim;
-    int dim_id = (i * 2) % head_dim;
-
-    half2 *p_ori_qkv = (half2 *)(ori_qkv + batch_id * batch_seq_len * hidden_size * 3 + token_id * hidden_size * 3);
-    half2 *p_new_qkv;
-
-    int target_id = batch_id * head_num * fmha_seq_len * head_dim + head_id * fmha_seq_len * head_dim +
-                    token_id * head_dim + dim_id;
-    /* q */
-    p_new_qkv = (half2 *)(new_q + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* k */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_k + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* v */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_v + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-}
-
-void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz,
-                          int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream) {
-    int hsz = head_num * head_dim;
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hidden_size / 2 > 4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 != 0");
-    }
-    if (head_dim % 2 != 0) {
-        throw std::runtime_error("head_dim % 2 != 0");
-    }
-    dim3 blockSize(bsz, ori_seq_len);
-    IxinferArrangeEncQkvKernel<<<blockSize, hsz / 2, 0, stream>>>(ori_qkv, new_q, new_k, new_v, head_dim,
-                                                                  head_num, ori_seq_len, fmha_seq_len);
-}
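For reference, the layout transform the arrange kernel above performs can be written as a plain CPU loop: the fused QKV tensor of shape [B, S, 3*E] is split into contiguous Q, K, V buffers of shape [B, H, S, D] with E = H*D. A sketch that ignores the half2 vectorization and the fmha sequence padding (names and float element type are illustrative):

```cpp
#include <vector>

// Split fused QKV [B, S, 3*E] into Q, K, V each shaped [B, H, S, D], E = H * D.
void arrangeQkvReference(const std::vector<float>& qkv, std::vector<float>& q,
                         std::vector<float>& k, std::vector<float>& v,
                         int B, int S, int H, int D)
{
    const int E = H * D;
    for (int b = 0; b < B; ++b)
        for (int s = 0; s < S; ++s)
            for (int h = 0; h < H; ++h)
                for (int d = 0; d < D; ++d) {
                    const int src = (b * S + s) * 3 * E + h * D + d;  // q slot; k at +E, v at +2E
                    const int dst = ((b * H + h) * S + s) * D + d;
                    q[dst] = qkv[src];
                    k[dst] = qkv[src + E];
                    v[dst] = qkv[src + 2 * E];
                }
}
```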
-
-__global__ void IxinferEncAttnOutArrangeKernel(const half *ori_q, half *new_q, const int bsz, const int ori_seq_len,
-                                               const int fmha_seq_len, const int head_num, const int head_dim) {
-    half2 *p_ori_q = (half2 *)ori_q;
-    half2 *p_new_q = (half2 *)new_q;
-
-    int batch_token_num = ori_seq_len * head_dim * head_num;
-    int hidden_size = head_dim * head_num;
-    int data_length = bsz * ori_seq_len * head_num * head_dim;
-
-    int elem_idx = threadIdx.x + blockIdx.x * blockDim.x;
-    while (elem_idx < data_length / 2) {
-        int half_elem_idx = elem_idx * 2;
-
-        int bsz_idx = half_elem_idx / batch_token_num;
-        int seq_idx = half_elem_idx % batch_token_num / hidden_size;
-        int head_idx = half_elem_idx % batch_token_num % hidden_size / head_dim;
-        int dim_idx = half_elem_idx % batch_token_num % hidden_size % head_dim;
-
-        int src_index = bsz_idx * head_num * fmha_seq_len * head_dim + head_idx * fmha_seq_len * head_dim +
-                        seq_idx * head_dim + dim_idx;
-
-        p_new_q[elem_idx] = p_ori_q[src_index / 2];
-
-        elem_idx += gridDim.x * blockDim.x;
-    }
-}
-
-void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num,
-                              int head_dim, cudaStream_t stream) {
-    if (bsz * ori_seq_len * head_num * head_dim % 2 != 0) {
-        throw std::runtime_error("bsz * ori_seq_len * head_num * head_dim % 2 != 0");
-    }
-    int data_length = bsz * ori_seq_len * head_num * head_dim / 2;
-    int num_threads = 512;
-    int num_blocks = ((data_length - 1 + num_threads) / num_threads);
-    num_blocks = std::min(num_blocks, 128);
-    IxinferEncAttnOutArrangeKernel<<<num_blocks, num_threads, 0, stream>>>(ori_q, new_q, bsz, ori_seq_len, fmha_seq_len,
-                                                                           head_num, head_dim);
-}
-
-
-template <int log2_elements>
-__global__ void IxinferCorrelationSoftmaxEncselfKernel(__half *correlation, const int *src_padding_mask,
-                                                       const int batch_seq_len) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-
-    int head_num = blockDim.y;
-    int seq_len = gridDim.y;
-    int start_idx = (blockIdx.x * head_num * seq_len * batch_seq_len + threadIdx.y * seq_len * batch_seq_len +
-                     blockIdx.y * batch_seq_len);
-
-    half2 *p_correlation = (half2 *)(correlation + start_idx);
-    int32_t *p_mask = (int32_t *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-    int local_idx = threadIdx.x;
-
-    float2 elements[WARP_ITERATIONS];
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        int element_index = local_idx + it * SOFT_WARP_SIZE;
-        if (element_index < batch_seq_len / 2) {
-            half2 correlation_value = p_correlation[element_index];
-
-            elements[it].x = p_mask[element_index * 2] ? -INFINITY : __half2float(correlation_value.x);
-            elements[it].y = p_mask[element_index * 2 + 1] ? -INFINITY : __half2float(correlation_value.y);
-        } else {
-            elements[it].x = -INFINITY;
-            elements[it].y = -INFINITY;
-        }
-    }
-
-    float max_value = elements[0].x;
-    max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-
-#pragma unroll
-    for (int it = 1; it < WARP_ITERATIONS; ++it) {
-        max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-        max_value = (max_value > elements[it].y) ?
max_value : elements[it].y; - } - - warp_reduce(&max_value); - - float sum = 0.0f; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[it].x = __expf(elements[it].x - max_value); - elements[it].y = __expf(elements[it].y - max_value); - - sum += (elements[it].x + elements[it].y); - } - - warp_reduce(&sum); - sum = 1.0f / sum; - -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - half2 correlation_value; - if (element_index < batch_seq_len / 2) { - correlation_value.x = __float2half(elements[it].x * sum); - correlation_value.y = __float2half(elements[it].y * sum); - - p_correlation[element_index] = correlation_value; - - } else { - break; - } - } -} - -void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - __half *correlation, const int *src_padding_mask) { - if (batch_seq_len > 4096) { - throw std::runtime_error("batch_seq_len should <= 4096"); - } - if (batch_seq_len % 2 != 0) { - throw std::runtime_error("batch_seq_len % 2 != 0"); - } - - int log2_elements = log2_ceil(batch_seq_len / 2); - int next_power_of_two = 1 << log2_elements; - int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - dim3 grid(batch_size, batch_seq_len); - - dim3 block(WARP_SIZE, head_num); - - switch (log2_elements) { - case 0: - IxinferCorrelationSoftmaxEncselfKernel<0> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 1: - IxinferCorrelationSoftmaxEncselfKernel<1> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 2: - IxinferCorrelationSoftmaxEncselfKernel<2> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 3: - IxinferCorrelationSoftmaxEncselfKernel<3> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 4: - IxinferCorrelationSoftmaxEncselfKernel<4> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 5: - IxinferCorrelationSoftmaxEncselfKernel<5> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 6: - IxinferCorrelationSoftmaxEncselfKernel<6> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 7: - IxinferCorrelationSoftmaxEncselfKernel<7> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 8: - IxinferCorrelationSoftmaxEncselfKernel<8> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 9: - IxinferCorrelationSoftmaxEncselfKernel<9> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 10: - IxinferCorrelationSoftmaxEncselfKernel<10> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 11: - IxinferCorrelationSoftmaxEncselfKernel<11> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 12: - IxinferCorrelationSoftmaxEncselfKernel<12> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - default: - throw std::runtime_error("IxinferCorrelationSoftmaxEncself NotImplementedError"); - break; - } -} - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) { - /* qkv arrange*/ - // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim) - IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len, - 
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc;
-    cuinferDataType_t _cuinferCompType = cuinferDataType_t::CUINFER_DATA_FLOAT;
-    cuinferDataType_t _cuinferDataType = cuinferDataType_t::CUINFER_DATA_HALF;
-    cuinferDataType_t _cuinferMaskType = cuinferDataType_t::CUINFER_DATA_INT32;
-    cuinferCreateTensorDescriptor(&qDesc);
-    cuinferCreateTensorDescriptor(&kDesc);
-    cuinferCreateTensorDescriptor(&vDesc);
-    cuinferCreateTensorDescriptor(&maskDesc);
-    cuinferCreateTensorDescriptor(&oDesc);
-
-    cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferMaskType, bsz, 1, 1,
-                                 fmha_seq_len);
-    cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                 fmha_seq_len, head_dim);
-
-    cuinferFMHAParam fmha_param;
-    cuinferFMHAForward(cuinfer_handle, fmha_param, _cuinferCompType, _cuinferDataType, _cuinferMaskType, qDesc,
-                       q_buffer, kDesc, k_buffer, vDesc, v_buffer, maskDesc, mask, oDesc, q_buffer, true);
-
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;
-}
-#else
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                                     half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out,
-                                     int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                                     cublasLtHandle_t &blaslt_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cublaslt_gemm(k_buffer, q_buffer, qk_out, bsz * head_num, fmha_seq_len, fmha_seq_len, head_dim,
-                  fmha_seq_len * head_dim, fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len,
-                  1.0/sqrt(head_dim*1.0), blaslt_handle, stream);
-
-    IxinferCorrelationSoftmaxEncself(bsz, fmha_seq_len, head_num, stream, qk_out, mask);
-
-    cublaslt_gemm_nn(v_buffer, qk_out, q_buffer, bsz * head_num, head_dim, fmha_seq_len, fmha_seq_len,
-                     fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, fmha_seq_len * head_dim, 1.0f,
-                     blaslt_handle, stream);
-
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;
-}
-#endif
-}  // namespace bert
-}  // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
deleted file mode 100644
index aaee52b710d275427188b9bf8174bfc9b924faaf..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License.
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - - -namespace nvinfer1::ixrt_plugin { -namespace bert { -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cuinferHandle_t &cuinfer_handle, cudaStream_t &stream); -#else -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cublasLtHandle_t &blaslt_handle, cudaStream_t &stream); -#endif - -void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz, - int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream); - -void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num, - int head_dim, cudaStream_t stream); - -void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - half *correlation, const int *src_padding_mask); - -class QKVToContextPluginDynamic : public nvinfer1::IPluginV2DynamicExt -{ -public: - QKVToContextPluginDynamic(const std::string name, const nvinfer1::DataType type, const int32_t hiddenSize, - const int32_t numHeads, float const dqProbs, bool hasImask = false); - - QKVToContextPluginDynamic(const std::string name, void const* data, size_t length); - - // It doesn't make sense to make QKVToContextPluginDynamic without arguments, so we - // delete default constructor. 
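// Note on dispatch: on __ILUVATAR__ builds this plugin forwards to the
// cuinfer fused-attention entry point declared above; otherwise it uses the
// cublasLt overload (explicit QK^T GEMM, masked softmax, PV GEMM), with
// IxinferArrangeEncQkv / IxinferEncAttnOutArrange handling the layout
// conversions on both paths.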
- QKVToContextPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - -private: - const std::string mLayerName; - std::string mNamespace; - - int32_t mS; - int32_t mB; - int32_t mSM; - int32_t mHeadSize; - int32_t mHiddenSize; - int32_t mNumHeads; - bool mHasImask; - nvinfer1::DataType mType; - float mDqProbs; -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; - - half* query_; -}; - -class QKVToContextPluginDynamicCreator : public nvinfer1::IPluginCreator -{ -public: - QKVToContextPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp deleted file mode 100644 index 6e4e5a37e148b4ad3719cbce8bef4e3261a83c89..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp +++ /dev/null @@ -1,404 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "skipLayerNormInt8Plugin.h" - -#include "NvInferRuntime.h" -#include "checkMacrosPlugin.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -// Clip plugin specific constants -namespace { -char const* kSKIP_LAYER_NORM_INT8_VERSION_HFACE{"3"}; -char const* kSKIP_LAYER_NORM_INT8_VERSION_MTRON{"4"}; -char const* kSKIP_LAYER_NORM_INT8_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection SkipLayerNormInt8PluginBaseCreator::mFC{}; -std::vector SkipLayerNormInt8PluginBaseCreator::mPluginAttributes; - -constexpr auto param_type = DataType::kFLOAT; - -SkipLayerNormInt8PluginBaseCreator::SkipLayerNormInt8PluginBaseCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("beta")); - mPluginAttributes.emplace_back(PluginField("gamma")); - mPluginAttributes.emplace_back(PluginField("bias")); - mPluginAttributes.emplace_back(PluginField("output_fp32")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -SkipLayerNormInt8PluginHFaceCreator::SkipLayerNormInt8PluginHFaceCreator() : SkipLayerNormInt8PluginBaseCreator() {} - -char const* SkipLayerNormInt8PluginBaseCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; } - -PluginFieldCollection const* SkipLayerNormInt8PluginBaseCreator::getFieldNames() noexcept { return &mFC; } - -void SkipLayerNormInt8PluginBaseCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* SkipLayerNormInt8PluginBaseCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -char const* SkipLayerNormInt8PluginHFaceCreator::getPluginVersion() const noexcept { - return kSKIP_LAYER_NORM_INT8_VERSION_HFACE; -} - -bool buildBetaAndGamma(PluginFieldCollection const* fc, Weights& beta, Weights& gamma, Weights& bias) { - ixrt_plugin::validateRequiredAttributesExist({"beta", "gamma"}, fc); - - bool output_fp32 = false; - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - - if (field_name.compare("beta") == 0) { - gLogInfo << "Building beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("gamma") == 0) { - gLogInfo << "Building gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bias") == 0) { - gLogInfo << "Building bias..." 
<< endl; - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp32") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp32 = (static_cast(fc->fields[i].data)[0] == 1); - gLogInfo << "Building output_fp32" << output_fp32 << endl; - } - } - - IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta"); - IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta"); - - IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma"); - IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma"); - return output_fp32; -} - -IPluginV2* SkipLayerNormInt8PluginHFaceCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "SkipLayerNormInt8PluginHFaceCreator createPlugin" << endl; - - Weights beta{DataType::kFLOAT, nullptr, 0}; - Weights gamma{DataType::kFLOAT, nullptr, 0}; - Weights bias{DataType::kFLOAT, nullptr, 0}; - bool output_fp32 = buildBetaAndGamma(fc, beta, gamma, bias); - return new SkipLayerNormInt8PluginHFace(name, beta, gamma, bias, output_fp32); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* SkipLayerNormInt8PluginHFaceCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call SkipLayerNormInterleavedPlugin::destroy() - try { - gLogInfo << "SkipLayerNormInterleavedPluginHFaceCreator deserializePlugin" << endl; - return new SkipLayerNormInt8PluginHFace(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -// REGISTER_TENSORRT_PLUGIN(SkipLayerNormInt8PluginHFaceCreator); -//#########################################################################// -SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, Weights const& beta, - Weights const& gamma, Weights const& bias, bool output_fp32) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mBiasDev(nullptr), - mLd(beta.count), - mParamsOnDevice(false), - output_fp32(output_fp32) { - IXRT_PLUGIN_ASSERT(mLd > 0); - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - // dataType for beta, gamma weights is always fp16 - mParamWordsize = getElementSize(param_type); - - mBeta.convertAndCopy(beta, param_type); - mGamma.convertAndCopy(gamma, param_type); - - mHasBias = (bias.values != nullptr); - if (mHasBias) { - mBias.convertAndCopy(bias, param_type); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev); - } -} - -SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mParamsOnDevice(false) { - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mLd); - deserialize_value(&data, &length, &mHasBias); - deserialize_value(&data, &length, &output_fp32); - - mParamWordsize = getElementSize(param_type); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mLd, param_type); - mGamma.convertAndCopy(d, mLd, param_type); - - if (mHasBias) { - mBias.convertAndCopy(d, mLd, param_type); - 
} - - copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev); - } -} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, Weights const& beta, - Weights const& gamma, Weights const& bias, bool output_fp32) - : SkipLayerNormInt8PluginBase(name, beta, gamma, bias, output_fp32) {} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length) - : SkipLayerNormInt8PluginBase(name, data, length) { - gLogInfo << "SkipLayerNormInt8PluginHFace deserialize" << endl; -} - -// IPluginV2 Methods -char const* SkipLayerNormInt8PluginBase::getPluginType() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; } - -size_t SkipLayerNormInt8PluginBase::getSerializationSize() const noexcept { - const size_t biasSize = mHasBias ? (mLd * mParamWordsize) : 0; - return 2 * mParamWordsize * mLd + sizeof(mLd) + sizeof(mHasBias) + sizeof(output_fp32) + biasSize; -} - -void SkipLayerNormInt8PluginBase::serialize(void* buffer) const noexcept { - try { - serialize_value(&buffer, mLd); - serialize_value(&buffer, mHasBias); - serialize_value(&buffer, output_fp32); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mBetaDev.get()), mLd * mParamWordsize); - serFromDev(d, static_cast(mGammaDev.get()), mLd * mParamWordsize); - if (mHasBias) { - serFromDev(d, static_cast(mBiasDev.get()), mLd * mParamWordsize); - } - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::destroy() noexcept { - try { - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - if (mHasBias) { - mBiasDev.reset(nullptr); - } - delete this; - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* SkipLayerNormInt8PluginBase::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// HFace -int32_t SkipLayerNormInt8PluginHFace::initialize() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace initialize" << endl; - return 0; -} - -void SkipLayerNormInt8PluginHFace::terminate() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace terminate" << endl; -} - -void SkipLayerNormInt8PluginHFace::destroy() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace destroy" << endl; - SkipLayerNormInt8PluginBase::destroy(); -} - -char const* SkipLayerNormInt8PluginHFace::getPluginVersion() const noexcept { - return kSKIP_LAYER_NORM_INT8_VERSION_HFACE; -} - -int32_t SkipLayerNormInt8PluginHFace::getNbOutputs() const noexcept { return 2; } - -// IPluginV2Ext Methods -DataType SkipLayerNormInt8PluginBase::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(index >= 0 && index < getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - if (index == 0) { - return output_fp32 ? 
DataType::kFLOAT : DataType::kINT8; - } - return DataType::kFLOAT; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -DimsExprs SkipLayerNormInt8PluginBase::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(outputIndex >= 0 && outputIndex < getNbOutputs()); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - return inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool SkipLayerNormInt8PluginBase::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs)); - - PluginTensorDesc const& desc = inOut[pos]; - if (pos == 2 || pos == 4 || (output_fp32 && pos == 3)) { - return desc.type == DataType::kFLOAT && desc.format == TensorFormat::kLINEAR; - } - return desc.type == DataType::kINT8 && desc.format == TensorFormat::kLINEAR; - } catch (std::exception const& e) { - caughtError(e); - } - return false; -} - -void SkipLayerNormInt8PluginBase::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - auto const& inDims0 = inputs[0].desc.dims; - auto const& inDims1 = inputs[1].desc.dims; - auto const& inDims2 = inputs[2].desc.dims; - TRT_UNUSED inDims1; - TRT_UNUSED inDims2; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims2.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims2.d)); - - mParamWordsize = getElementSize(param_type); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t SkipLayerNormInt8PluginBase::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -// HFace IPluginV2DynamicExt Methods -IPluginV2DynamicExt* SkipLayerNormInt8PluginHFace::clone() const noexcept { - try { - gLogInfo << "SkipLayerNormInterleavedPluginHFace clone" << endl; - auto* p = new SkipLayerNormInt8PluginHFace(mLayerName, mBeta, mGamma, mBias, output_fp32); - p->initialize(); - p->setPluginNamespace(mNamespace.c_str()); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -int32_t SkipLayerNormInt8PluginHFace::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - auto const iDesc = inputDesc[0]; - auto const oDesc = outputDesc[0]; - - const int32_t B = iDesc.dims.d[0]; - const int32_t S = iDesc.dims.d[1]; - const int32_t E = iDesc.dims.d[2]; - int 
batch_token_num = B * S; - float const dqScaleIn = iDesc.scale; - IXRT_PLUGIN_ASSERT(dqScaleIn > 1e-9); - float const qScale = oDesc.scale; - int8_t const* input = static_cast(inputs[0]); - int8_t const* skip = static_cast(inputs[1]); - float* residual = (float*)inputs[2]; - float const* gamma = static_cast(mGammaDev.get()); - float const* beta = static_cast(mBetaDev.get()); - float const* bias = static_cast(mBiasDev.get()); - float* residual_out = static_cast(outputs[1]); - - if (!output_fp32) { - int8_t* output = static_cast(outputs[0]); - skipLayerNormI8II8O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - dqScaleIn, 1.0 / qScale, 1024, stream, true); - } else { - float* output = static_cast(outputs[0]); - skipLayerNormI8IF32O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - 1.0 / dqScaleIn, 1.0 / qScale, 1024, stream, true); - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu deleted file mode 100644 index 7cd3e56418726a3b8e32b7835560771aee873cca..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/
-#include "backend/bert/bert_helper.h"
-#include "skipLayerNormInt8Plugin.h"
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8II8OKernel(const int8_t *input, const float *scale, const float *bias,
-                                          const float *residual_bias, int8_t *output, float *residual, float* residual_out,
-                                          int hidden_size, float dequant_scale, float quant_scale,
-                                          bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    char4 *p_output = (char4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_residual_bias = (float4 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-    p_residual_out += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-    // load data from global memory
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        // vals = dequant(input) + residual + bias
-        p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x;
-        p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y;
-        p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z;
-        p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w;
-        vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale);
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    // mean var
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon,
-                                                      p_scale[element_index], p_bias[element_index]);
-
-        p_residual_out[element_index].x = norm_value.x;
-        p_residual_out[element_index].y = norm_value.y;
-        p_residual_out[element_index].z = norm_value.z;
-        p_residual_out[element_index].w = norm_value.w;
-
-        char4 res = float42char4(norm_value, quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
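Both kernels in this file accumulate mean and variance with a one-pass Welford scheme: each thread folds its dequantized values into a running (mean, m2, count) via `WelfordCombine`, and `WelfordWarpReduce` merges the per-thread partials across the warp before the result is broadcast from lane 0. A scalar sketch of the recurrence these helpers appear to implement (assuming the standard Welford update, as the names suggest):

```cpp
#include <cstdio>

// Scalar Welford update: numerically stable mean/variance in one pass.
struct Welford {
    float mean = 0.0f, m2 = 0.0f, count = 0.0f;
    void push(float x) {
        count += 1.0f;
        float delta = x - mean;
        mean += delta / count;          // running mean
        m2 += delta * (x - mean);       // running sum of squared deviations
    }
    float variance() const { return count > 0.0f ? m2 / count : 0.0f; }
};

int main() {
    Welford w;
    const float xs[] = {1.0f, 2.0f, 3.0f, 4.0f};
    for (float x : xs) w.push(x);
    printf("mean=%f var=%f\n", w.mean, w.variance());  // mean=2.5 var=1.25
    return 0;
}
```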
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8IF32OKernel(const int8_t *input, const float *scale, const float *bias,
-                                           const float *residual_bias, float *output, float *residual, float* residual_out,
-                                           int hidden_size, float dequant_scale, float quant_scale,
-                                           bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    float4 *p_output = (float4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_residual_bias = (float4 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-    p_residual_out += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-    // load data from global memory
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        // vals = dequant(input) + residual + bias
-        p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x;
-        p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y;
-        p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z;
-        p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w;
-        vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale);
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    // mean var
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon,
-                                                      p_scale[element_index], p_bias[element_index]);
-
-        p_output[element_index].x = norm_value.x;
-        p_output[element_index].y = norm_value.y;
-        p_output[element_index].z = norm_value.z;
-        p_output[element_index].w = norm_value.w;
-    }
-}
-
-
-void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias,
-                         int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size,
-                         float dequant_scale, float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                         bool is_post_ln) {
-
-    if (hidden_size > 1024) {
-        throw std::runtime_error("hidden_size should <= 1024");
-    }
-    if (hidden_size % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int num_warp = hidden_size / C10_WARP_SIZE / 4;
-
-    switch (num_warp) {
-        case 1:
-            skipLayernormI8II8OKernel<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual,
-                                                     residual_out, hidden_size, dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 2:
-            skipLayernormI8II8OKernel<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual,
-                                                     residual_out, hidden_size, dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 3:
-            skipLayernormI8II8OKernel<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual,
-                                                     residual_out, hidden_size, dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 4:
-            skipLayernormI8II8OKernel<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual,
-                                                     residual_out, hidden_size, dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 5:
-            skipLayernormI8II8OKernel<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8II8OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8II8OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8II8OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8II8OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8II8OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8II8OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8II8OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8II8OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8II8OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8II8OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8II8OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - skipLayernormI8IF32OKernel<1> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 2: - skipLayernormI8IF32OKernel<2> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 3: - skipLayernormI8IF32OKernel<3> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 4: - skipLayernormI8IF32OKernel<4> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - 
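// Note: num_warp here is really the per-thread float4 count
// (hidden_size elements / C10_WARP_SIZE lanes / 4 elements per vector).
// Dispatching it through a switch turns THREAD_DATA_LEN into a
// compile-time constant, so vals[THREAD_DATA_LEN] can live in registers
// and the #pragma unroll loops in the kernels above fully unroll.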
case 5: - skipLayernormI8IF32OKernel<5> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8IF32OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8IF32OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8IF32OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8IF32OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8IF32OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8IF32OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8IF32OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8IF32OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8IF32OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8IF32OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8IF32OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h deleted file mode 100644 index f752f59f5e590b00485568f07c43cd47ea4586a1..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once - -#include -#include -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - - -void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln); - -void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln); - -class SkipLayerNormInt8PluginBase : public nvinfer1::IPluginV2DynamicExt -{ -public: - SkipLayerNormInt8PluginBase( - std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32); - - SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make SkipLayerNormInterleavedPlugin without - // arguments, so we delete default constructor. - SkipLayerNormInt8PluginBase() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - -protected: - std::string const& mLayerName; - std::string mNamespace; - - bert::cuda_unique_ptr mGammaDev; - bert::cuda_unique_ptr mBetaDev; - size_t mLd{}; // leading dim - bert::WeightsWithOwnership mGamma; - bert::WeightsWithOwnership mBeta; - - size_t mParamWordsize{}; - bool mParamsOnDevice{}; - bool mHasBias{}; - cuda_unique_ptr mBiasDev; - WeightsWithOwnership mBias; - bool output_fp32{}; -}; - -class SkipLayerNormInt8PluginHFace : public SkipLayerNormInt8PluginBase -{ -public: - SkipLayerNormInt8PluginHFace( - std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32); - - SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make SkipLayerNormInterleavedPlugin without - // arguments, so we delete default constructor. 
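// Note: per the .cpp above, the HFace variant reports two outputs:
// outputs[0] is the normalized activation (int8 by default, fp32 when
// output_fp32 is set), and outputs[1] is the fp32 residual_out tensor,
// which apparently carries the updated skip connection forward.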
- SkipLayerNormInt8PluginHFace() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - // IPluginV2 Methods - int32_t initialize() noexcept override; - void terminate() noexcept override; - void destroy() noexcept override; - int32_t getNbOutputs() const noexcept override; - char const* getPluginVersion() const noexcept override; -}; - -class SkipLayerNormInt8PluginBaseCreator : public nvinfer1::IPluginCreator -{ -public: - SkipLayerNormInt8PluginBaseCreator(); - - char const* getPluginName() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class SkipLayerNormInt8PluginHFaceCreator : public SkipLayerNormInt8PluginBaseCreator -{ -public: - SkipLayerNormInt8PluginHFaceCreator(); - - char const* getPluginVersion() const noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp deleted file mode 100644 index 4ca63061c499490f3bf679fedb0c44bf16e961eb..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "skipLayerNormPlugin.h" - -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* kSKIP_LAYER_NORM_VERSION{"1"}; -char const* kSKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"}; -char const* kSKIP_LAYER_NORM_VAR_SEQLEN_VERSION{"2"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection SkipLayerNormPluginDynamicCreator::mFC{}; -std::vector SkipLayerNormPluginDynamicCreator::mPluginAttributes; - -// REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginDynamicCreator); - -static inline DataType getParamWordType(DataType cfgType) noexcept { - if (cfgType == DataType::kINT8) { - return DataType::kHALF; - } - - return cfgType; -} - -SkipLayerNormPluginDynamicCreator::SkipLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("ld")); - mPluginAttributes.emplace_back(PluginField("type_id")); - mPluginAttributes.emplace_back(PluginField("beta")); - mPluginAttributes.emplace_back(PluginField("gamma")); - mPluginAttributes.emplace_back(PluginField("bias")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_NAME; } - -char const* SkipLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; } - -PluginFieldCollection const* SkipLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* SkipLayerNormPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamicCreator createPlugin" << endl; - - int32_t ld = 0; - Weights beta{DataType::kFLOAT, nullptr, 0}; - Weights gamma{DataType::kFLOAT, nullptr, 0}; - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - - IXRT_PLUGIN_ASSERT(fc != nullptr); - - ixrt_plugin::validateRequiredAttributesExist({"type_id", "beta", "ld", "gamma"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - gLogInfo << "Building ld: " << ld << endl; - } - - if (field_name.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (field_name.compare("beta") == 0) { - gLogInfo << "Building beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("gamma") == 0) { - gLogInfo << "Building gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bias") == 0) { - gLogInfo << "Building bias..." 
<< endl; - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - } - gLogInfo << "Type " << typeId << endl; - - IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 3, - ("SkipLayerNorm: Invalid type ID: " + std::to_string(typeId)).c_str()); - - IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta"); - IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta"); - - IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma"); - IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma"); - - IXRT_PLUGIN_CHECK_VALUE(typeId == (int)DataType::kHALF, "typeId != DataType::kHALF error"); - - return new SkipLayerNormPluginDynamic(name, static_cast(typeId), ld, beta, gamma, bias); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::IPluginV2* SkipLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - return new SkipLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void SkipLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -//#########################################################################// -SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string name, const DataType type, int32_t const ld, - Weights const& beta, Weights const& gamma, Weights const& bias) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mHiddenSize(ld), mType(type), mBiasDev(nullptr) { - IXRT_PLUGIN_ASSERT(mType == nvinfer1::DataType::kFLOAT || mType == nvinfer1::DataType::kHALF || - mType == nvinfer1::DataType::kINT8); - - mCfgType = mType == DataType::kINT8 ? 
DataType::kHALF : mType;
-    mParamWordsize = getElementSize(mCfgType);
-
-    mBeta.convertAndCopy(beta, mCfgType);
-    mGamma.convertAndCopy(gamma, mCfgType);
-
-    mHasBias = (bias.values != nullptr);
-    if (mHasBias) {
-        mBias.convertAndCopy(bias, mCfgType);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev);
-    }
-}
-
-SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string& name, void const* data, size_t length)
-    : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mBiasDev(nullptr) {
-    gLogInfo << "SkipLayerNormPluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mCfgType);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mHasBias);
-
-    IXRT_PLUGIN_ASSERT(mCfgType == nvinfer1::DataType::kFLOAT || mCfgType == nvinfer1::DataType::kHALF);
-    mParamWordsize = getElementSize(mCfgType);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mHiddenSize, mCfgType);
-    mGamma.convertAndCopy(d, mHiddenSize, mCfgType);
-    if (mHasBias) {
-        mBias.convertAndCopy(d, mHiddenSize, mCfgType);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev);
-    }
-}
-
-// IPluginV2Ext Methods
-DataType SkipLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                       int32_t nbInputs) const noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-        IXRT_PLUGIN_ASSERT(index == 0);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        return inputTypes[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DataType{};
-}
-
-// IPluginV2 Methods
-char const* SkipLayerNormPluginDynamic::getPluginType() const noexcept { return kSKIP_LAYER_NORM_NAME; }
-
-char const* SkipLayerNormPluginDynamic::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; }
-
-int32_t SkipLayerNormPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t SkipLayerNormPluginDynamic::initialize() noexcept {
-    gLogInfo << "SkipLayerNormPluginDynamic initialize" << endl;
-    return 0;
-}
-
-void SkipLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "SkipLayerNormPluginDynamic terminate" << endl; }
-
-size_t SkipLayerNormPluginDynamic::getSerializationSize() const noexcept {
-    const size_t biasSize = mHasBias ? (mHiddenSize * mParamWordsize) : 0;
-    return 2 * mParamWordsize * mHiddenSize + 2 * sizeof(DataType) + sizeof(mHiddenSize) + biasSize + sizeof(mHasBias);
-}
-
-void SkipLayerNormPluginDynamic::serialize(void* buffer) const noexcept {
-    try {
-        serialize_value(&buffer, mType);
-        serialize_value(&buffer, mCfgType);
-        serialize_value(&buffer, mHiddenSize);
-        serialize_value(&buffer, mHasBias);
-
-        char* d = static_cast<char*>(buffer);
-        serFromDev(d, static_cast<char*>(mBetaDev.get()), mHiddenSize * mParamWordsize);
-        serFromDev(d, static_cast<char*>(mGammaDev.get()), mHiddenSize * mParamWordsize);
-        if (mHasBias) {
-            serFromDev(d, static_cast<char*>(mBiasDev.get()), mHiddenSize * mParamWordsize);
-        }
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormPluginDynamic::destroy() noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic destroy" << endl;
-        // This gets called when the network containing plugin is destroyed
-        mGammaDev.reset(nullptr);
-        mBetaDev.reset(nullptr);
-        if (mHasBias) {
-            mBiasDev.reset(nullptr);
-        }
-        delete this;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* SkipLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* SkipLayerNormPluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic clone" << endl;
-
-        auto* p = new SkipLayerNormPluginDynamic(mLayerName, mType, mHiddenSize, mBeta, mGamma, mBias);
-        p->initialize();
-        p->setPluginNamespace(mNamespace.c_str());
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                          int32_t nbInputs, IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        return inputs[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool SkipLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                           int32_t nbOutputs) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inOut != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs));
-
-        PluginTensorDesc const& in = inOut[pos];
-        if (pos == 0) {
-            return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-        }
-        PluginTensorDesc const& prev = inOut[pos - 1];
-
-        return in.type == prev.type && in.format == prev.format && (in.type == DataType::kHALF);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return false;
-}
-
-void SkipLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                                 DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic configurePlugin" << endl;
-
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        if (mType == DataType::kFLOAT || mType == DataType::kHALF) {
-            IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-            IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type);
-        } else {
-            IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type || DataType::kFLOAT == inputs[0].desc.type);
-            IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type || DataType::kFLOAT == inputs[1].desc.type);
-        }
-        auto const& inDims0 = inputs[0].desc.dims;
-        auto const& inDims1 = inputs[1].desc.dims;
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims);
-
-        IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d));
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        mHiddenSize = inDims0.d[HDIM];  // hidden size
-        IXRT_PLUGIN_ASSERT(mHiddenSize != 0U);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-        IXRT_PLUGIN_ASSERT(outputs[0].desc.type == DataType::kHALF);
-
-        mCfgType = inputs[0].desc.type == DataType::kINT8 ? DataType::kHALF : inputs[0].desc.type;
-
-        auto const paramType = getParamWordType(mCfgType);
-        mParamWordsize = getElementSize(paramType);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t SkipLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                    PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-int32_t SkipLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                            void const* const* inputs, void* const* outputs, void* workspace,
-                                            cudaStream_t stream) noexcept {
-    gLogInfo << "in SkipLayerNormPluginDynamic.." << endl;
-    int32_t status = -1;
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        int32_t const inputVolume = volume(inputDesc[0].dims);
-        DataType iType = inputDesc->type;
-
-        // Our plugin outputs only one tensor
-        // Launch CUDA kernel wrapper and save its return value
-        if (iType == DataType::kFLOAT) {
-            gLogInfo << "SkipLayerNormPlugin fp32 not supported yet!" << endl;
-            return STATUS_NOT_SUPPORTED;
-        } else if (iType == DataType::kHALF) {
-            auto const* input = static_cast<half const*>(inputs[0]);
-            auto skip = (half*)(inputs[1]);
-            auto* output = static_cast<half*>(outputs[0]);
-            auto const* const bias = static_cast<half const*>(mBiasDev.get());
-            auto const* const beta = static_cast<half const*>(mBetaDev.get());
-            auto const* const gamma = static_cast<half const*>(mGammaDev.get());
-
-            if (mHasBias) {
-                status = computeSkipLayerNorm<true, half>(stream, static_cast<int32_t>(mHiddenSize), inputVolume,
-                                                          input, gamma, beta, bias, skip, output);
-            } else {
-                status = computeSkipLayerNorm<false, half>(stream, static_cast<int32_t>(mHiddenSize), inputVolume,
-                                                           input, gamma, beta, bias, skip, output);
-            }
-        } else {
-            IXRT_PLUGIN_CHECK_VALUE(false, "Unsupported type error, expected [kHALF,kFLOAT], but received " +
-                                               std::to_string(static_cast<int32_t>(iType)));
-        }
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
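A note for readers tracing the removed plugin above: `serialize` writes four scalar fields and then the raw parameter blobs, and the deserializing constructor must read them back in exactly that order. A minimal Python sketch of that buffer discipline (`struct` stands in for `serialize_value`/`serFromDev`; the field widths here are illustrative assumptions, not the plugin's actual ABI):

```python
import struct

HEADER = "<iiq?"  # assumed widths: mType, mCfgType, mHiddenSize, mHasBias

def pack_plugin(m_type, cfg_type, hidden, beta, gamma, bias=None):
    # Scalars first, in the exact order the deserializing constructor reads them.
    buf = struct.pack(HEADER, m_type, cfg_type, hidden, bias is not None)
    # Then the parameter blobs: beta, gamma, and (only if present) bias.
    return buf + beta + gamma + (bias or b"")

def unpack_plugin(buf, word):
    m_type, cfg_type, hidden, has_bias = struct.unpack_from(HEADER, buf)
    off = struct.calcsize(HEADER)
    beta = buf[off : off + hidden * word]; off += hidden * word
    gamma = buf[off : off + hidden * word]; off += hidden * word
    bias = buf[off : off + hidden * word] if has_bias else None
    return m_type, cfg_type, hidden, beta, gamma, bias
```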
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu
deleted file mode 100644
index 1b127fc5bbd62c131fea5f0eceddc4dc5b464d47..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu
+++ /dev/null
@@ -1,401 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include <cassert>
-#include <cuda_fp16.h>
-#include <iostream>
-
-#include "backend/bert/bert_helper.h"
-#include "skipLayerNormPlugin.h"
-// #include "backend/transformer/transformer_add_norm.h"
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias,
-                                         const half *residual_bias, half *output, half *residual, int hidden_size,
-                                         bool is_post_ln) {
-    float2 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 2;
-    half2 *p_input = (half2 *)input;
-    half2 *p_output = (half2 *)output;
-    half2 *p_residual = (half2 *)residual;
-    half2 *p_scale = (half2 *)scale;
-    half2 *p_bias = (half2 *)bias;
-    half2 *p_residual_bias = (half2 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        if (element_index < hidden_size / 2) {
-            half2 value1 = p_input[element_index];
-            half2 value2 = p_residual[element_index];
-
-            vals[it].x = __half2float(value1.x) + __half2float(value2.x);
-            vals[it].y = __half2float(value1.y) + __half2float(value2.y);
-
-            half2 res_bias_val_1;
-            if (residual_bias == nullptr) {
-                res_bias_val_1.x = __float2half(0.0f);
-                res_bias_val_1.y = __float2half(0.0f);
-            } else {
-                res_bias_val_1 = p_residual_bias[element_index];
-            }
-            vals[it].x = vals[it].x + __half2float(res_bias_val_1.x);
-            vals[it].y = vals[it].y + __half2float(res_bias_val_1.y);
-
-            WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        }
-    }
-
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        if (element_index < hidden_size / 2) {
-            float2 norm_value;
-            half2 scale_1 = p_scale[element_index];
-            half2 bias_1 = p_bias[element_index];
-            norm_value.x = (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) +
-                           __half2float(bias_1.x);
-            norm_value.y = (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) +
-                           __half2float(bias_1.y);
-
-            half2 res;
-            res.x = __float2half(norm_value.x);
-            res.y = __float2half(norm_value.y);
-
-            p_output[element_index] = res;
-
-            half2 r1;
-            if (is_post_ln) {
-                r1 = res;
-            } else {
-                r1.x = __float2half(vals[it].x);
-                r1.y = __float2half(vals[it].y);
-            }
-            p_residual[element_index] = r1;
-        }
-    }
-}
-
-void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                              half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                              bool is_post_ln) {
-    if (hidden_size > 2048) {
-        throw std::runtime_error("hidden_size should be <= 2048");
-    }
-    if (hidden_size % 2 != 0) {
-        throw std::runtime_error("hidden_size % 2 != 0");
-    }
-
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int nearest_hidden_size = hidden_size;
-    if (nearest_hidden_size % (C10_WARP_SIZE * 2) != 0) {
-        nearest_hidden_size = nearest_hidden_size + C10_WARP_SIZE * 2 - nearest_hidden_size % (C10_WARP_SIZE * 2);
-    }
-
-    int num_warp = nearest_hidden_size / C10_WARP_SIZE / 2;
-
-    switch (num_warp) {
-        case 1:
-            IxinferResidualBiasLnPad<1><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 2:
-            IxinferResidualBiasLnPad<2><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 3:
-            IxinferResidualBiasLnPad<3><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 4:
-            IxinferResidualBiasLnPad<4><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 5:
-            IxinferResidualBiasLnPad<5><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 6:
-            IxinferResidualBiasLnPad<6><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 7:
-            IxinferResidualBiasLnPad<7><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 8:
-            IxinferResidualBiasLnPad<8><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 9:
-            IxinferResidualBiasLnPad<9><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 10:
-            IxinferResidualBiasLnPad<10><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 11:
-            IxinferResidualBiasLnPad<11><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 12:
-            IxinferResidualBiasLnPad<12><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 13:
-            IxinferResidualBiasLnPad<13><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 14:
-            IxinferResidualBiasLnPad<14><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 15:
-            IxinferResidualBiasLnPad<15><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        case 16:
-            IxinferResidualBiasLnPad<16><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, hidden_size, is_post_ln);
-            break;
-        default:
-            std::cout << "hidden size: " << hidden_size << std::endl;
-            throw std::runtime_error("IxinferResidualBiasLnPad not supported!");
-            break;
-    }
-}
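The launcher above maps the runtime hidden_size onto a compile-time THREAD_DATA_LEN by padding up to a multiple of C10_WARP_SIZE * 2 and switching over the resulting per-thread element count (each thread consumes half2 pairs, so one warp covers C10_WARP_SIZE * 2 scalars per pass). A sketch of just that index arithmetic (Python; a warp size of 64 is assumed here, while the real constant comes from bert_helper.h):

```python
WARP_SIZE = 64  # assumed; the CUDA side uses C10_WARP_SIZE from bert_helper.h

def pick_thread_data_len(hidden_size: int) -> int:
    # Round hidden_size up to a multiple of (WARP_SIZE * 2), then convert to
    # the per-thread half2 count that the switch statement dispatches on.
    step = WARP_SIZE * 2
    padded = hidden_size if hidden_size % step == 0 else hidden_size + step - hidden_size % step
    num_warp = padded // step  # == THREAD_DATA_LEN selected by the switch
    if not 1 <= num_warp <= 16:
        raise ValueError(f"hidden_size {hidden_size} not supported")
    return num_warp

assert pick_thread_data_len(1024) == 8  # BERT-large hidden size, warp size 64
```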
std::runtime_error("IxinferResidualBiasLnPad not supported!"); - break; - } -} - -template -__global__ void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int hidden_size, bool is_post_ln) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 2; - half2 *p_input = (half2 *)input; - half2 *p_output = (half2 *)output; - half2 *p_residual = (half2 *)residual; - half2 *p_scale = (half2 *)scale; - half2 *p_bias = (half2 *)bias; - half2 *p_residual_bias = (half2 *)residual_bias; - - p_input += block_start; - p_output += block_start; - p_residual += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - half2 value1 = p_input[element_index]; - half2 value2 = p_residual[element_index]; - - vals[it].x = __half2float(value1.x) + __half2float(value2.x); - vals[it].y = __half2float(value1.y) + __half2float(value2.y); - - half2 res_bias_val_1; - if (residual_bias == nullptr) { - res_bias_val_1.x = __float2half(0.0f); - res_bias_val_1.y = __float2half(0.0f); - } else { - res_bias_val_1 = p_residual_bias[element_index]; - } - vals[it].x = vals[it].x + __half2float(res_bias_val_1.x); - vals[it].y = vals[it].y + __half2float(res_bias_val_1.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float2 norm_value; - half2 scale_1 = p_scale[element_index]; - half2 bias_1 = p_bias[element_index]; - norm_value.x = - (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x); - norm_value.y = - (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y); - - half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - p_output[element_index] = res; - - half2 r1; - if (is_post_ln) { - r1 = res; - } else { - r1.x = __float2half(vals[it].x); - r1.y = __float2half(vals[it].y); - } - p_residual[element_index] = r1; - } -} - -void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if ((hidden_size % 2 == 0) && (hidden_size % (C10_WARP_SIZE * 2) != 0)) { - IxinferResidualBiasLnPad(input, scale, bias, residual_bias, output, residual, batch_tokens, hidden_size, stream, - is_post_ln); - } else { - if (hidden_size % (C10_WARP_SIZE * 2) != 0) { - throw std::runtime_error("hidden_size // (C10_WARP_SIZE*2) != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferResidualBiasLn<1><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, 
is_post_ln); - break; - case 2: - IxinferResidualBiasLn<2><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 3: - IxinferResidualBiasLn<3><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 4: - IxinferResidualBiasLn<4><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 5: - IxinferResidualBiasLn<5><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 6: - IxinferResidualBiasLn<6><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 7: - IxinferResidualBiasLn<7><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 8: - IxinferResidualBiasLn<8><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 9: - IxinferResidualBiasLn<9><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 10: - IxinferResidualBiasLn<10><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 11: - IxinferResidualBiasLn<11><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 12: - IxinferResidualBiasLn<12><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 13: - IxinferResidualBiasLn<13><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 14: - IxinferResidualBiasLn<14><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 15: - IxinferResidualBiasLn<15><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 16: - IxinferResidualBiasLn<16><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - default: - throw std::runtime_error("IxinferResidualBiasLn"); - break; - } - } -} - -template -int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output) -{ - assert(volume % E == 0); - int32_t batch_tokens = volume / E; - IxinferResidualBiasLn(input, gamma, beta, bias, output, skip, batch_tokens, E, stream, true); - return 0; -} - -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h deleted file mode 100644 index fa37318fcff8e3d8ab3f3cfcadf34e378477a1a3..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
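Both kernels accumulate layer-norm statistics with Welford's online algorithm (WelfordCombine per element, WelfordWarpReduce across lanes) instead of summing x and x² directly, which keeps the fp32 running variance numerically stable. A scalar Python sketch of the update rule the CUDA helpers implement:

```python
def welford_combine(x, mean, m2, count):
    # One Welford update: fold x into the running (mean, m2, count).
    count += 1
    delta = x - mean
    mean += delta / count
    m2 += delta * (x - mean)  # uses the *updated* mean, as in the CUDA helper
    return mean, m2, count

def layernorm_stats(xs):
    mean = m2 = count = 0.0
    for x in xs:
        mean, m2, count = welford_combine(x, mean, m2, count)
    # The kernel's normalizer is rsqrtf(m2 / hidden_size + epsilon).
    return mean, m2 / count

mean, var = layernorm_stats([1.0, 2.0, 3.0, 4.0])
assert (mean, var) == (2.5, 1.25)
```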
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h
deleted file mode 100644
index fa37318fcff8e3d8ab3f3cfcadf34e378477a1a3..0000000000000000000000000000000000000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License"); you may
-* not use this file except in compliance with the License. You may obtain
-* a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-* License for the specific language governing permissions and limitations
-* under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <cuda_fp16.h>
-#include <string>
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <bool hasBias, typename T>
-int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma,
-                             const T* beta, const T* bias, T* skip, T* output);
-
-void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                           half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                           bool is_post_ln);
-
-void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                              half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                              bool is_post_ln);
-
-class SkipLayerNormPluginDynamic : public IPluginV2DynamicExt {
-   public:
-    SkipLayerNormPluginDynamic(const std::string name, const nvinfer1::DataType type, int32_t const ld,
-                               nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma,
-                               nvinfer1::Weights const& bias);
-    SkipLayerNormPluginDynamic(const std::string& name, void const* data, size_t length);
-    SkipLayerNormPluginDynamic() noexcept = delete;
-    ~SkipLayerNormPluginDynamic() override = default;
-
-    // IPluginV2 methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* libNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext methods
-    DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt methods
-    IPluginV2DynamicExt* clone() const noexcept override;
-    DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                  IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
-                         int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
-                            int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
-                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-   private:
-    const std::string mLayerName;
-    std::string mNamespace;
-    cuda_unique_ptr<void> mGammaDev;
-    cuda_unique_ptr<void> mBetaDev;
-    WeightsWithOwnership mGamma;
-    WeightsWithOwnership mBeta;
-    size_t mHiddenSize{};
-    size_t mParamWordsize{};
-    DataType mType;
-    DataType mCfgType;
-    // mCfgType is the dataType for beta, gamma and bias weights, always fp16 or fp32
-    // mType is the plugin IO datatype, can be int8
-
-    bool mHasBias{};
-    cuda_unique_ptr<void> mBiasDev;
-    WeightsWithOwnership mBias;
-};
-
-class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    SkipLayerNormPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
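Taken together, the deleted .cu/.h pair implemented a fused residual-add + bias-add + LayerNorm: out = LN(input + skip [+ bias]) * gamma + beta, with the pre-norm sum optionally written back through the residual pointer. A NumPy reference that could be used to sanity-check a replacement kernel (shapes and epsilon here are assumptions, not values taken from the plugin):

```python
import numpy as np

def skip_layernorm(x, skip, gamma, beta, bias=None, eps=1e-6, post_ln=True):
    # x, skip: (batch_tokens, hidden); gamma, beta, bias: (hidden,)
    h = x.astype(np.float32) + skip.astype(np.float32)
    if bias is not None:
        h = h + bias.astype(np.float32)
    mean = h.mean(axis=-1, keepdims=True)
    var = h.var(axis=-1, keepdims=True)  # population variance, i.e. m2 / hidden
    out = (h - mean) / np.sqrt(var + eps) * gamma + beta
    residual = out if post_ln else h     # what the kernel writes back to `skip`
    return out, residual
```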
diff --git a/tests/model_info.json b/tests/model_info.json
index a717db6791a98170231e4ccb96bd446488356c65..0f025cf81d8581ef0b1be15d2e2c059c2b182b8f 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -1519,7 +1519,7 @@
             "github_branch": "",
             "github_path": "",
             "datasets": "",
-            "download_url": "",
+            "download_url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth",
             "need_third_part": "",
             "precisions": [
                 "fp16",
@@ -3254,7 +3254,7 @@
             "github_repo": "",
             "github_branch": "",
             "github_path": "",
-            "datasets": "local/tmp",
+            "datasets": "local/facenet_datasets",
             "download_url": "https://drive.google.com/open?id=1R77HmFADxe87GmoLwzfgMu_HY0IhcyBz",
             "need_third_part": "",
             "precisions": [
@@ -6012,7 +6012,8 @@
             "download_url": "https://local/bert-large-uncased",
             "need_third_part": "",
             "precisions": [
-                "fp16"
+                "fp16",
+                "int8"
             ],
             "type": "inference",
             "hasDemo": false,
@@ -7098,7 +7099,7 @@
             "github_branch": "",
             "github_path": "",
             "datasets": "",
-            "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2",
+            "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2_6",
             "need_third_part": false,
             "precisions": [
                 "fp16"
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 0bf60dbc2df9e2c1b336a31254fb966f80f2ad1d..1ab1995b4c95f0174f9229820724d85529b2887f 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -259,25 +259,40 @@ def run_detec_testcase(model):
     run_script(prepare_script)
 
     config_name = model_name.upper()
+    if model_name == "yolov5":
+        config_name = "YOLOV5M"
 
     for prec in model["precisions"]:
         logging.info(f"Start running {model_name} {prec} test case")
-        script = f"""
-        cd ../{model['model_path']}
-        export DATASETS_DIR=./{dataset_n}/
-
-        export MODEL_PATH=./{model_name}.onnx
-
-        export PROJ_DIR=./
-        export CHECKPOINTS_DIR=./checkpoints
-        export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
-        export EVAL_DIR=./{dataset_n}/val2017
-        export RUN_DIR=./
-        export CONFIG_DIR=config/{config_name}_CONFIG
-
-        bash scripts/infer_{model_name}_{prec}_accuracy.sh
-        bash scripts/infer_{model_name}_{prec}_performance.sh
-        """
+        result["result"].setdefault(prec, {"status": "FAIL"})
+        if model_name in ["yolov3", "yolov5", "yolov5s", "yolov7"]:
+            script = f"""
+            cd ../{model['model_path']}
+            export DATASETS_DIR=./{dataset_n}/
+            export MODEL_PATH=./{model_name}.onnx
+            export PROJ_DIR=./
+            export CHECKPOINTS_DIR=./checkpoints
+            export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
+            export EVAL_DIR=./{dataset_n}/images/val2017
+            export RUN_DIR=../../ixrt_common
+            export CONFIG_DIR=../../ixrt_common/config/{config_name}_CONFIG
+            bash scripts/infer_{model_name}_{prec}_accuracy.sh
+            bash scripts/infer_{model_name}_{prec}_performance.sh
+            """
+        else:
+            script = f"""
+            cd ../{model['model_path']}
+            export DATASETS_DIR=./{dataset_n}/
+            export MODEL_PATH=./{model_name}.onnx
+            export PROJ_DIR=./
+            export CHECKPOINTS_DIR=./checkpoints
+            export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
+            export EVAL_DIR=./{dataset_n}/images/val2017
+            export RUN_DIR=./
+            export CONFIG_DIR=config/{config_name}_CONFIG
+            bash scripts/infer_{model_name}_{prec}_accuracy.sh
+            bash scripts/infer_{model_name}_{prec}_performance.sh
+            """
 
     if model_name == "rtmpose":
         script = f"""
@@ -292,7 +308,6 @@ def run_detec_testcase(model):
     combined_pattern = re.compile(f"{fps_pattern}|{e2e_pattern}")
     matchs = combined_pattern.finditer(sout)
     for match in matchs:
-        result["result"].setdefault(prec, {"status": "FAIL"})
        for name, value in match.groupdict().items():
            if value:
                try:
@@ -304,7 +319,6 @@ def run_detec_testcase(model):
     pattern = r"Average Precision \(AP\) @\[ (IoU=0.50[:\d.]*)\s*\| area= all \| maxDets=\s?\d+\s?\] =\s*([\d.]+)"
     matchs = re.findall(pattern, sout)
     for m in matchs:
-        result["result"].setdefault(prec, {})
         try:
             result["result"][prec][m[0]] = float(m[1])
         except ValueError:
@@ -316,7 +330,6 @@ def run_detec_testcase(model):
     pattern = METRIC_PATTERN
     matchs = re.findall(pattern, sout)
     if matchs and len(matchs) == 1:
-        result["result"].setdefault(prec, {})
         result["result"][prec].update(get_metric_result(matchs[0]))
         result["result"][prec]["status"] = "PASS"
     result["result"][prec]["Cost time (s)"] = t
@@ -376,6 +389,23 @@ def run_segmentation_and_face_testcase(model):
             result["result"][prec].update(get_metric_result(m))
         if len(matchs) == 2:
             result["result"][prec]["status"] = "PASS"
+    else:
+        patterns = {
+            "FPS": r"FPS\s*:\s*(\d+\.?\d*)",
+            "Accuracy": r"Accuracy\s*:\s*(\d+\.?\d*)"
+        }
+
+        combined_pattern = re.compile("|".join(f"(?P<{name}>{pattern})" for name, pattern in patterns.items()))
+        matchs = combined_pattern.finditer(sout)
+        match_count = 0
+        for match in matchs:
+            for name, value in match.groupdict().items():
+                if value:
+                    match_count += 1
+                    result["result"][prec][name] = float(f"{float(value.split(':')[1].strip()):.3f}")
+                    break
+        if match_count == len(patterns):
+            result["result"][prec]["status"] = "PASS"
     result["result"][prec]["Cost time (s)"] = t
 
     logging.debug(f"matchs:\n{matchs}")
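The new fallback branch in run_segmentation_and_face_testcase above folds the per-metric regexes into one alternation of named groups and then walks the matches; a condensed, standalone sketch of that parsing approach (the sample log text is illustrative):

```python
import re

patterns = {
    "FPS": r"FPS\s*:\s*(\d+\.?\d*)",
    "Accuracy": r"Accuracy\s*:\s*(\d+\.?\d*)",
}
combined = re.compile("|".join(f"(?P<{name}>{p})" for name, p in patterns.items()))

sout = "facenet eval done. FPS : 1523.4\nAccuracy : 0.987\n"
metrics = {}
for match in combined.finditer(sout):
    for name, value in match.groupdict().items():
        if value:  # only the alternative that actually matched is non-None
            metrics[name] = round(float(value.split(":")[1].strip()), 3)
print(metrics)  # {'FPS': 1523.4, 'Accuracy': 0.987} once both patterns have hit
```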
@@ -451,16 +481,16 @@ def run_nlp_testcase(model):
     elif model_name == "bert_large_squad":
         script = f"""
         set -x
-        cd ../{model['model_path']}/python
-        bash script/build_engine.sh --bs 32
-        bash script/inference_squad.sh --bs 32
+        cd ../{model['model_path']}
+        bash scripts/infer_bert_large_squad_fp16_accuracy.sh
+        bash scripts/infer_bert_large_squad_fp16_performance.sh
         """
         if prec == "int8":
             script = f"""
             set -x
-            cd ../{model['model_path']}/python
-            bash script/build_engine.sh --bs 32 --int8
-            bash script/inference_squad.sh --bs 32 --int8
+            cd ../{model['model_path']}
+            bash scripts/infer_bert_large_squad_int8_accuracy.sh
+            bash scripts/infer_bert_large_squad_int8_performance.sh
             """
 
     r, t = run_script(script)
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
index c6100a4016ecb3b10b63e669c4170f899fc26c10..be795462d5917eb7c8f7a6dca34cdb70f75946d2 100644
--- a/tests/run_vllm.py
+++ b/tests/run_vllm.py
@@ -229,7 +229,13 @@ def run_nlp_testcase(model):
         export VLLM_ASSETS_CACHE=../vllm/
         python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
         """
-    elif model_name == "h2vol" or model_name == "idefics3":
+    elif model_name == "idefics3":
+        script = f"""
+        set -x
+        cd ../{model['model_path']}
+        python3 offline_inference_vision_language.py --model-type idefics3
+        """
+    elif model_name == "h2vol":
         script = f"""
         set -x
         cd ../{model['model_path']}
@@ -240,8 +246,7 @@ def run_nlp_testcase(model):
         script = f"""
         set -x
         cd ../{model['model_path']}
-        export VLLM_ASSETS_CACHE=../vllm/
-        PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+        python3 offline_inference_vision_language.py --model-type minicpmv
         """
     elif model_name == "llama-3.2":
         script = f"""