From 00c3946768fd8204079f67584bc445080ece0c81 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Mon, 30 Jun 2025 14:47:53 +0800
Subject: [PATCH 01/15] sync yolov3 infer

---
 .../cv/object_detection/yolov3/ixrt/README.md |  4 +-
 .../yolov3/ixrt/build_engine.py               | 15 -----
 .../yolov3/ixrt/build_nms_engine.py           | 19 +-----
 .../yolov3/ixrt/calibration_dataset.py        | 18 +-----
 .../yolov3/ixrt/coco_labels.py                | 15 -----
 .../cv/object_detection/yolov3/ixrt/common.py | 22 ++-----
 .../yolov3/ixrt/config/YOLOV3_CONFIG          | 15 -----
 .../object_detection/yolov3/ixrt/cut_model.py | 15 -----
 .../yolov3/ixrt/datasets/__init__.py          | 14 -----
 .../yolov3/ixrt/datasets/coco.py              | 15 -----
 .../yolov3/ixrt/datasets/common.py            | 15 -----
 .../yolov3/ixrt/datasets/post_process.py      | 15 -----
 .../yolov3/ixrt/datasets/pre_process.py       | 15 -----
 .../yolov3/ixrt/datasets/vision.py            | 15 -----
 .../cv/object_detection/yolov3/ixrt/deploy.py | 60 +++++++-----------
 .../object_detection/yolov3/ixrt/inference.py | 62 +++++++++----------
 .../yolov3/ixrt/load_ixrt_plugin.py           | 15 -----
 .../yolov3/ixrt/modify_batchsize.py           | 34 +++++-----
 .../cv/object_detection/yolov3/ixrt/quant.py  | 18 +-----
 .../scripts/infer_yolov3_fp16_accuracy.sh     | 31 ++++------
 .../scripts/infer_yolov3_fp16_performance.sh  | 37 +++++------
 .../scripts/infer_yolov3_int8_accuracy.sh     | 31 ++++------
 .../scripts/infer_yolov3_int8_performance.sh  | 33 ++++------
 .../yolov3/ixrt/simplify_model.py             | 15 -----
 .../stable-diffusion/diffusers/ci/prepare.sh  |  2 +-
 25 files changed, 133 insertions(+), 417 deletions(-)

diff --git a/models/cv/object_detection/yolov3/ixrt/README.md b/models/cv/object_detection/yolov3/ixrt/README.md
index c4fd306f..e0003094 100644
--- a/models/cv/object_detection/yolov3/ixrt/README.md
+++ b/models/cv/object_detection/yolov3/ixrt/README.md
@@ -51,10 +51,10 @@ mv weights/export.onnx /Path/to/checkpoints/yolov3.onnx
 
 ```bash
 export PROJ_DIR=/Path/to/yolov3/ixrt
-export DATASETS_DIR=/Path/to/coco2017/
+export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=./coco/annotations/instances_val2017.json
-export EVAL_DIR=./coco/val2017
+export EVAL_DIR=./coco/images/val2017
 export RUN_DIR=/Path/to/yolov3/ixrt
 export CONFIG_DIR=config/YOLOV3_CONFIG
 ```
diff --git a/models/cv/object_detection/yolov3/ixrt/build_engine.py b/models/cv/object_detection/yolov3/ixrt/build_engine.py
index a919bdd0..d47e45e5 100644
--- a/models/cv/object_detection/yolov3/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov3/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
diff --git a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
index 3be0d83d..25f0ab8a 100644
--- a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import argparse
 import torch
@@ -21,9 +6,11 @@ from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
 
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
 def create_onnx(args):
     nms = helper.make_node(
-        "NMS",
+        "DetectionNMS_IxRT",
         name="NMS",
         inputs=["nms_input"],
         outputs=["nms_output0", "nms_output1"],
diff --git a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
index de37775a..578e013d 100644
--- a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
@@ -1,22 +1,10 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
+
+
+
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov3/ixrt/coco_labels.py b/models/cv/object_detection/yolov3/ixrt/coco_labels.py
index 43f5bd82..69d38878 100644
--- a/models/cv/object_detection/yolov3/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov3/ixrt/coco_labels.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov3/ixrt/common.py b/models/cv/object_detection/yolov3/ixrt/common.py
index aba2117c..5f543555 100644
--- a/models/cv/object_detection/yolov3/ixrt/common.py
+++ b/models/cv/object_detection/yolov3/ixrt/common.py
@@ -1,23 +1,9 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -81,13 +67,15 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
         print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG b/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG
index 8cbd0f49..9b1fe491 100644
--- a/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG
+++ b/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # BSZ : 构建engine以及推理时的batchsize
 # IMGSIZE : 模型输入hw大小
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov3/ixrt/cut_model.py b/models/cv/object_detection/yolov3/ixrt/cut_model.py
index e9ee19aa..af0a3a4f 100644
--- a/models/cv/object_detection/yolov3/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov3/ixrt/cut_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py
index 162e24b4..e69de29b 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py
@@ -1,14 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/coco.py b/models/cv/object_detection/yolov3/ixrt/datasets/coco.py
index 73c5df54..7f355b84 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/coco.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/coco.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os.path
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/common.py b/models/cv/object_detection/yolov3/ixrt/datasets/common.py
index ef36eba3..e120e00f 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/common.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/common.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
index 8590816a..a58c02f8 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
index c651f8ad..8cc643a8 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/vision.py b/models/cv/object_detection/yolov3/ixrt/datasets/vision.py
index eadefb2c..32da4a78 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/vision.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/vision.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov3/ixrt/deploy.py b/models/cv/object_detection/yolov3/ixrt/deploy.py
index 8c2cc424..ec56b7ab 100644
--- a/models/cv/object_detection/yolov3/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov3/ixrt/deploy.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
 import argparse
@@ -77,17 +62,16 @@ def customize_ops(graph, args):
         stride=16,
         faster_impl=args.faster
     )
-    
+    graph = t.AddYoloDecoderOp(
+        inputs=decoder_input[num*2:num*2+1],
+        outputs=["decoder_32"],
+        op_type=args.decoder_type,
+        anchor=args.decoder32_anchor,
+        num_class=args.num_class,
+        stride=32,
+        faster_impl=args.faster
+    )
     if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:num*2+1],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
         graph = t.AddYoloDecoderOp(
             inputs=decoder_input[num*2+1:],
             outputs=["decoder_64"],
@@ -102,25 +86,24 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    else:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
+    elif args.with_nms:
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
             axis=1
         )
 
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
+        graph.outputs.clear()
+        graph.add_output("output")
+        graph.outputs["output"].dtype = "FLOAT"
+    else:
+        graph.outputs.clear()
+        graph.add_output("decoder_8")
+        graph.outputs["decoder_8"].dtype = "FLOAT"
+        graph.add_output("decoder_16")
+        graph.outputs["decoder_16"].dtype = "FLOAT"
+        graph.add_output("decoder_32")
+        graph.outputs["decoder_32"].dtype = "FLOAT"
     return graph
 
 def parse_args():
@@ -128,6 +111,7 @@ def parse_args():
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
     parser.add_argument("--decoder_input_names", nargs='+', type=str)
     parser.add_argument("--decoder8_anchor", nargs='*', type=int)
     parser.add_argument("--decoder16_anchor", nargs='*', type=int)
diff --git a/models/cv/object_detection/yolov3/ixrt/inference.py b/models/cv/object_detection/yolov3/ixrt/inference.py
index 42413282..a2cc7d79 100644
--- a/models/cv/object_detection/yolov3/ixrt/inference.py
+++ b/models/cv/object_detection/yolov3/ixrt/inference.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
@@ -25,8 +10,8 @@ import sys
 
 import torch
 import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 from coco_labels import coco80_to_coco91_class, labels
 from common import save2json, box_class85to6
@@ -62,7 +47,7 @@ def main(config):
 
     bsz = config.bsz
     num_samples = 5000
-    if config.loop_count > 0 and config.loop_count < num_samples/bsz :
+    if config.loop_count > 0:
         num_samples = bsz * config.loop_count
     num_batch = len(dataloader)
     print("=" * 30)
@@ -109,17 +94,18 @@ def main(config):
         cur_bsz_sample = batch_data.shape[0]
 
         # Set input
-        cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
-
+        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
         # Forward
-        start_time = time.time()
+        # start_time = time.time()
         context.execute_v2(allocations)
-        end_time = time.time()
-        forward_time += end_time - start_time
+        # end_time = time.time()
+        # forward_time += end_time - start_time
 
         if config.test_mode == "MAP":
             # Fetch output
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 1 : prepare data to nms
             _, box_num, box_unit = output.shape
@@ -138,10 +124,13 @@ def main(config):
             if config.nms_type == "GPU":
 
                 # Set nms input
-                cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input)
+                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
-                cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"])
-                cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"])
+                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 3 : post process + save
             pred_boxes = post_process_func(
@@ -153,9 +142,16 @@ def main(config):
                 max_det=config.max_det
             )
             save2json(batch_img_id, pred_boxes, json_result, class_map)
-    fps = num_samples / forward_time
+
+    # fps = num_samples / forward_time
 
     if config.test_mode == "FPS":
+        start_time = time.time()       
+        for i in range(config.loop_count):
+            context.execute_v2(allocations)  
+        end_time = time.time()  
+        forward_time = end_time - start_time      
+        fps = (config.loop_count*config.bsz) / forward_time
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
         if fps >= config.fps_target:
@@ -163,12 +159,12 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
     if config.test_mode == "MAP":
         if len(json_result) == 0:
             print("Predict zero box!")
-            exit(1)
+            exit(10)
 
         if not os.path.exists(config.pred_dir):
             os.makedirs(config.pred_dir)
@@ -179,7 +175,6 @@ def main(config):
         with open(pred_json, "w") as f:
             json.dump(json_result, f)
 
-        start_time = time.time()
         anno_json = config.coco_gt
         anno = COCO(anno_json)  # init annotations api
         pred = anno.loadRes(pred_json)  # init predictions api
@@ -191,17 +186,16 @@ def main(config):
             f"==============================eval {config.model_name} {config.precision} coco map =============================="
         )
         eval.summarize()
-        e2e_time = time.time() - start_time
+
         map, map50 = eval.stats[:2]
         print("MAP@0.5 : ", map50)
         print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        print(F"E2E time : {e2e_time:.3f} seconds")
         if map50 >= config.map_target:
             print("pass!")
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
 def parse_config():
     parser = argparse.ArgumentParser()
diff --git a/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
index ae47dc8e..932efbdf 100644
--- a/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
index 3a88c160..f696ae55 100644
--- a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
@@ -1,20 +1,7 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
+import copy
+import numpy as np
 
 def change_input_dim(model, bsz):
     batch_size = bsz
@@ -46,7 +33,22 @@ def parse_args():
     args = parser.parse_args()
     return args
 
+def modify_resize_nodes(model, bsz):
+    print("modify resize")
+    for node in model.graph.node:
+        if node.op_type == "Resize":
+            if len(node.input) >= 4 and node.input[3]:
+                sizes_name = node.input[3]
+                for initializer in model.graph.initializer:
+                    if initializer.name == sizes_name:
+                        shape = copy.deepcopy(onnx.numpy_helper.to_array(initializer))
+                        shape[0] = shape[0] * bsz
+                        new_sizes = np.array(shape, dtype=np.int64)
+                        initializer.CopyFrom(onnx.numpy_helper.from_array(new_sizes, name=initializer.name))
+                        break
+    
 args = parse_args()
 model = onnx.load(args.origin_model)
 change_input_dim(model, args.batch_size)
-onnx.save(model, args.output_model)
\ No newline at end of file
+modify_resize_nodes(model, args.batch_size)
+onnx.save(model, args.output_model)
diff --git a/models/cv/object_detection/yolov3/ixrt/quant.py b/models/cv/object_detection/yolov3/ixrt/quant.py
index 36fd39a1..d73212ca 100644
--- a/models/cv/object_detection/yolov3/ixrt/quant.py
+++ b/models/cv/object_detection/yolov3/ixrt/quant.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import random
 import argparse
@@ -20,6 +5,9 @@ import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
+import sys
+sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
+print(sys.path)
 from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
index 7d6a609e..932edf9d 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.65
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -125,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
+            --with_nms             True                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -147,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -163,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
index 4fdf2ada..5f3360a6 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=1010
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -46,7 +33,6 @@ done
 source ${CONFIG_DIR}
 ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
-echo PROJ_DIR : ${PROJ_DIR}
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
@@ -55,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -91,6 +80,7 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
+
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -126,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -134,6 +124,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
+            --with_nms             False                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -148,7 +139,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -164,7 +155,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -191,7 +182,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}               \
+    --model_engine=${ENGINE_FILE}              \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
index e2162612..3e468467 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.65
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -125,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
+            --with_nms             True                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -147,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -163,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
index 7faed28c..c7ac4c1b 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=1010
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -126,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -134,6 +124,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
+            --with_nms             False                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -148,7 +139,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -164,7 +155,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov3/ixrt/simplify_model.py b/models/cv/object_detection/yolov3/ixrt/simplify_model.py
index 1400fd81..b4254b6f 100644
--- a/models/cv/object_detection/yolov3/ixrt/simplify_model.py
+++ b/models/cv/object_detection/yolov3/ixrt/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh b/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh
index b9140aa3..f256bc1d 100644
--- a/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh
+++ b/models/multimodal/diffusion_model/stable-diffusion/diffusers/ci/prepare.sh
@@ -24,5 +24,5 @@ else
     echo "Not Support Os"
 fi
 
-pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl
+pip3 install /mnt/deepspark/data/3rd_party/diffusers-0.31.0-py3-none-any.whl
 pip3 install -r requirements.txt
\ No newline at end of file
-- 
Gitee


From e404fe33fa9ebaaf19e5c1881ab85fb15130aa15 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Mon, 30 Jun 2025 15:39:55 +0800
Subject: [PATCH 02/15] sync yolov5s

---
 .../object_detection/yolov5s/ixrt/README.md   |  4 +-
 .../yolov5s/ixrt/build_engine.py              | 15 -----
 .../yolov5s/ixrt/build_nms_engine.py          | 19 +-----
 .../yolov5s/ixrt/calibration_dataset.py       | 18 +-----
 .../yolov5s/ixrt/coco_labels.py               | 15 -----
 .../object_detection/yolov5s/ixrt/common.py   | 21 ++-----
 .../yolov5s/ixrt/config/YOLOV5S_CONFIG        | 15 -----
 .../yolov5s/ixrt/cut_model.py                 | 15 -----
 .../yolov5s/ixrt/datasets/__init__.py         | 14 -----
 .../yolov5s/ixrt/datasets/coco.py             | 15 -----
 .../yolov5s/ixrt/datasets/common.py           | 15 -----
 .../yolov5s/ixrt/datasets/post_process.py     | 15 -----
 .../yolov5s/ixrt/datasets/pre_process.py      | 15 -----
 .../yolov5s/ixrt/datasets/vision.py           | 15 -----
 .../object_detection/yolov5s/ixrt/deploy.py   | 60 +++++++-----------
 .../yolov5s/ixrt/inference.py                 | 61 ++++++++-----------
 .../yolov5s/ixrt/load_ixrt_plugin.py          | 15 -----
 .../yolov5s/ixrt/modify_batchsize.py          | 15 -----
 .../cv/object_detection/yolov5s/ixrt/quant.py | 18 +-----
 .../scripts/infer_yolov5s_fp16_accuracy.sh    | 37 ++++-------
 .../scripts/infer_yolov5s_fp16_performance.sh | 37 ++++-------
 .../scripts/infer_yolov5s_int8_accuracy.sh    | 37 ++++-------
 .../scripts/infer_yolov5s_int8_performance.sh | 40 +++++-------
 .../yolov5s/ixrt/simplify_model.py            | 15 -----
 24 files changed, 118 insertions(+), 428 deletions(-)

diff --git a/models/cv/object_detection/yolov5s/ixrt/README.md b/models/cv/object_detection/yolov5s/ixrt/README.md
index 079c9644..245205bf 100755
--- a/models/cv/object_detection/yolov5s/ixrt/README.md
+++ b/models/cv/object_detection/yolov5s/ixrt/README.md
@@ -54,10 +54,10 @@ popd
 
 ```bash
 export PROJ_DIR=/Path/to/yolov5s/ixrt
-export DATASETS_DIR=/Path/to/coco2017/
+export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-export EVAL_DIR=${DATASETS_DIR}/val2017
+export EVAL_DIR=${DATASETS_DIR}/images/val2017
 export RUN_DIR=${PROJ_DIR}/
 export CONFIG_DIR=config/YOLOV5S_CONFIG
 ```
diff --git a/models/cv/object_detection/yolov5s/ixrt/build_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_engine.py
index a919bdd0..d47e45e5 100644
--- a/models/cv/object_detection/yolov5s/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov5s/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
diff --git a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
index 3be0d83d..25f0ab8a 100644
--- a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import argparse
 import torch
@@ -21,9 +6,11 @@ from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
 
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
 def create_onnx(args):
     nms = helper.make_node(
-        "NMS",
+        "DetectionNMS_IxRT",
         name="NMS",
         inputs=["nms_input"],
         outputs=["nms_output0", "nms_output1"],
diff --git a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
index de37775a..578e013d 100644
--- a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
@@ -1,22 +1,10 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
+
+
+
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py b/models/cv/object_detection/yolov5s/ixrt/coco_labels.py
index 43f5bd82..69d38878 100644
--- a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov5s/ixrt/coco_labels.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov5s/ixrt/common.py b/models/cv/object_detection/yolov5s/ixrt/common.py
index 695e05ba..5f543555 100644
--- a/models/cv/object_detection/yolov5s/ixrt/common.py
+++ b/models/cv/object_detection/yolov5s/ixrt/common.py
@@ -1,22 +1,9 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -80,13 +67,15 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
         print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG b/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
index 8aa23b8e..c3f46cf8 100755
--- a/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
+++ b/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # BSZ : 构建engine以及推理时的batchsize
 # IMGSIZE : 模型输入hw大小
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov5s/ixrt/cut_model.py b/models/cv/object_detection/yolov5s/ixrt/cut_model.py
index e9ee19aa..af0a3a4f 100644
--- a/models/cv/object_detection/yolov5s/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov5s/ixrt/cut_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py
index 162e24b4..e69de29b 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py
@@ -1,14 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py b/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
index 73c5df54..7f355b84 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os.path
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py b/models/cv/object_detection/yolov5s/ixrt/datasets/common.py
index ef36eba3..e120e00f 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/common.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
index 8590816a..a58c02f8 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
index c651f8ad..8cc643a8 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py b/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
index eadefb2c..32da4a78 100755
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
+++ b/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov5s/ixrt/deploy.py b/models/cv/object_detection/yolov5s/ixrt/deploy.py
index 37c5f9ac..ec56b7ab 100644
--- a/models/cv/object_detection/yolov5s/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov5s/ixrt/deploy.py
@@ -1,20 +1,5 @@
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import argparse
 from tensorrt.deploy.api import GraphTransform, create_source, create_target
 
@@ -77,17 +62,16 @@ def customize_ops(graph, args):
         stride=16,
         faster_impl=args.faster
     )
-    
+    graph = t.AddYoloDecoderOp(
+        inputs=decoder_input[num*2:num*2+1],
+        outputs=["decoder_32"],
+        op_type=args.decoder_type,
+        anchor=args.decoder32_anchor,
+        num_class=args.num_class,
+        stride=32,
+        faster_impl=args.faster
+    )
     if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:num*2+1],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
         graph = t.AddYoloDecoderOp(
             inputs=decoder_input[num*2+1:],
             outputs=["decoder_64"],
@@ -102,25 +86,24 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    else:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
+    elif args.with_nms:
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
             axis=1
         )
 
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
+        graph.outputs.clear()
+        graph.add_output("output")
+        graph.outputs["output"].dtype = "FLOAT"
+    else:
+        graph.outputs.clear()
+        graph.add_output("decoder_8")
+        graph.outputs["decoder_8"].dtype = "FLOAT"
+        graph.add_output("decoder_16")
+        graph.outputs["decoder_16"].dtype = "FLOAT"
+        graph.add_output("decoder_32")
+        graph.outputs["decoder_32"].dtype = "FLOAT"
     return graph
 
 def parse_args():
@@ -128,6 +111,7 @@ def parse_args():
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
     parser.add_argument("--decoder_input_names", nargs='+', type=str)
     parser.add_argument("--decoder8_anchor", nargs='*', type=int)
     parser.add_argument("--decoder16_anchor", nargs='*', type=int)
diff --git a/models/cv/object_detection/yolov5s/ixrt/inference.py b/models/cv/object_detection/yolov5s/ixrt/inference.py
index ad87fe1e..5f5452d5 100644
--- a/models/cv/object_detection/yolov5s/ixrt/inference.py
+++ b/models/cv/object_detection/yolov5s/ixrt/inference.py
@@ -1,21 +1,6 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import argparse
 import glob
 import json
@@ -25,8 +10,8 @@ import sys
 
 import torch
 import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 from coco_labels import coco80_to_coco91_class, labels
 from common import save2json, box_class85to6
@@ -62,7 +47,7 @@ def main(config):
 
     bsz = config.bsz
     num_samples = 5000
-    if config.loop_count > 0 and config.loop_count < num_samples/bsz :
+    if config.loop_count > 0:
         num_samples = bsz * config.loop_count
     num_batch = len(dataloader)
     print("=" * 30)
@@ -109,17 +94,19 @@ def main(config):
         cur_bsz_sample = batch_data.shape[0]
 
         # Set input
-        cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
         # Forward
-        start_time = time.time()
+        # start_time = time.time()
         context.execute_v2(allocations)
-        end_time = time.time()
-        forward_time += end_time - start_time
+        # end_time = time.time()
+        # forward_time += end_time - start_time
 
         if config.test_mode == "MAP":
             # Fetch output
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 1 : prepare data to nms
             _, box_num, box_unit = output.shape
@@ -138,10 +125,13 @@ def main(config):
             if config.nms_type == "GPU":
 
                 # Set nms input
-                cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input)
+                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
-                cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"])
-                cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"])
+                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 3 : post process + save
             pred_boxes = post_process_func(
@@ -154,9 +144,15 @@ def main(config):
             )
             save2json(batch_img_id, pred_boxes, json_result, class_map)
 
-    fps = num_samples / forward_time
+    # fps = num_samples / forward_time
 
     if config.test_mode == "FPS":
+        start_time = time.time()       
+        for i in range(config.loop_count):
+            context.execute_v2(allocations)  
+        end_time = time.time()  
+        forward_time = end_time - start_time      
+        fps = (config.loop_count*config.bsz) / forward_time
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
         if fps >= config.fps_target:
@@ -164,12 +160,12 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
     if config.test_mode == "MAP":
         if len(json_result) == 0:
             print("Predict zero box!")
-            exit(1)
+            exit(10)
 
         if not os.path.exists(config.pred_dir):
             os.makedirs(config.pred_dir)
@@ -180,7 +176,6 @@ def main(config):
         with open(pred_json, "w") as f:
             json.dump(json_result, f)
 
-        start_time = time.time()
         anno_json = config.coco_gt
         anno = COCO(anno_json)  # init annotations api
         pred = anno.loadRes(pred_json)  # init predictions api
@@ -192,18 +187,16 @@ def main(config):
             f"==============================eval {config.model_name} {config.precision} coco map =============================="
         )
         eval.summarize()
-        e2e_time = time.time() - start_time
+
         map, map50 = eval.stats[:2]
-        print(F"E2E time : {e2e_time:.3f} seconds")
         print("MAP@0.5 : ", map50)
         print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        print(F"E2E time : {e2e_time:.3f} seconds")
         if map50 >= config.map_target:
             print("pass!")
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
 def parse_config():
     parser = argparse.ArgumentParser()
diff --git a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
index ae47dc8e..932efbdf 100644
--- a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
index 3a88c160..00ed65dd 100644
--- a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 
diff --git a/models/cv/object_detection/yolov5s/ixrt/quant.py b/models/cv/object_detection/yolov5s/ixrt/quant.py
index 36fd39a1..d73212ca 100644
--- a/models/cv/object_detection/yolov5s/ixrt/quant.py
+++ b/models/cv/object_detection/yolov5s/ixrt/quant.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import random
 import argparse
@@ -20,6 +5,9 @@ import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
+import sys
+sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
+print(sys.path)
 from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
index 81b32fd1..b5cf3c97 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.56
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -48,14 +35,15 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
-echo COCO_GT : ${COCO_GT}
-echo EVAL_DIR : ${EVAL_DIR}
 echo RUN_DIR : ${RUN_DIR}
 echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -98,7 +86,7 @@ if [ $PRECISION == "int8" ];then
     echo;
     echo [STEP ${step}] : Quant Model
     if [[ -z ${QUANT_EXIST_ONNX} ]];then
-        QUANT_EXIST_ONNX=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}.onnx
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
     fi
     if [[ -f ${QUANT_EXIST_ONNX} ]];then
         CURRENT_MODEL=${QUANT_EXIST_ONNX}
@@ -112,7 +100,7 @@ if [ $PRECISION == "int8" ];then
             --data_process_type ${DATA_PROCESS_TYPE}        \
             --observer ${QUANT_OBSERVER}                    \
             --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
-            --save_dir ${CHECKPOINTS_DIR}                     \
+            --save_dir $CHECKPOINTS_DIR                     \
             --bsz   ${QUANT_BATCHSIZE}                      \
             --step  ${QUANT_STEP}                           \
             --seed  ${QUANT_SEED}                           \
@@ -127,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -135,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -149,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -165,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
index 4ab4f9e4..f765679f 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=840
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -48,14 +35,15 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
-echo COCO_GT : ${COCO_GT}
-echo EVAL_DIR : ${EVAL_DIR}
 echo RUN_DIR : ${RUN_DIR}
 echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -112,7 +100,7 @@ if [ $PRECISION == "int8" ];then
             --data_process_type ${DATA_PROCESS_TYPE}        \
             --observer ${QUANT_OBSERVER}                    \
             --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
-            --save_dir ${CHECKPOINTS_DIR}                     \
+            --save_dir $CHECKPOINTS_DIR                     \
             --bsz   ${QUANT_BATCHSIZE}                      \
             --step  ${QUANT_STEP}                           \
             --seed  ${QUANT_SEED}                           \
@@ -127,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -135,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -149,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -165,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
index fc7988df..9b41db7d 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.56
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -48,14 +35,15 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
-echo COCO_GT : ${COCO_GT}
-echo EVAL_DIR : ${EVAL_DIR}
 echo RUN_DIR : ${RUN_DIR}
 echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -98,7 +86,7 @@ if [ $PRECISION == "int8" ];then
     echo;
     echo [STEP ${step}] : Quant Model
     if [[ -z ${QUANT_EXIST_ONNX} ]];then
-        QUANT_EXIST_ONNX=${CHECKPOINTS_DIR}/quantized_${MODEL_NAME}.onnx
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
     fi
     if [[ -f ${QUANT_EXIST_ONNX} ]];then
         CURRENT_MODEL=${QUANT_EXIST_ONNX}
@@ -112,7 +100,7 @@ if [ $PRECISION == "int8" ];then
             --data_process_type ${DATA_PROCESS_TYPE}        \
             --observer ${QUANT_OBSERVER}                    \
             --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
-            --save_dir ${CHECKPOINTS_DIR}                     \
+            --save_dir $CHECKPOINTS_DIR                     \
             --bsz   ${QUANT_BATCHSIZE}                      \
             --step  ${QUANT_STEP}                           \
             --seed  ${QUANT_SEED}                           \
@@ -127,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -135,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -149,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -165,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
index dc912fa9..a2715061 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=840
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -48,14 +35,15 @@ ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
-echo COCO_GT : ${COCO_GT}
-echo EVAL_DIR : ${EVAL_DIR}
 echo RUN_DIR : ${RUN_DIR}
 echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -92,7 +80,6 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
-
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -113,7 +100,7 @@ if [ $PRECISION == "int8" ];then
             --data_process_type ${DATA_PROCESS_TYPE}        \
             --observer ${QUANT_OBSERVER}                    \
             --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
-            --save_dir ${CHECKPOINTS_DIR}                     \
+            --save_dir $CHECKPOINTS_DIR                     \
             --bsz   ${QUANT_BATCHSIZE}                      \
             --step  ${QUANT_STEP}                           \
             --seed  ${QUANT_SEED}                           \
@@ -128,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -136,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -150,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -166,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -193,7 +181,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}              \
+    --model_engine=${ENGINE_FILE}               \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py b/models/cv/object_detection/yolov5s/ixrt/simplify_model.py
index 1400fd81..b4254b6f 100644
--- a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py
+++ b/models/cv/object_detection/yolov5s/ixrt/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
-- 
Gitee


From 85365e1ebe9b1367bc7daf5bd6dded9419c75bd7 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Mon, 30 Jun 2025 15:49:09 +0800
Subject: [PATCH 03/15] sync yolov7

---
 .../cv/object_detection/yolov7/ixrt/README.md |  4 +-
 .../yolov7/ixrt/build_engine.py               | 15 -----
 .../yolov7/ixrt/build_nms_engine.py           | 19 +-----
 .../yolov7/ixrt/calibration_dataset.py        | 18 +----
 .../yolov7/ixrt/coco_labels.py                | 15 -----
 .../cv/object_detection/yolov7/ixrt/common.py | 22 ++-----
 .../yolov7/ixrt/config/YOLOV7_CONFIG          | 16 -----
 .../object_detection/yolov7/ixrt/cut_model.py | 15 -----
 .../yolov7/ixrt/datasets/__init__.py          | 14 ----
 .../yolov7/ixrt/datasets/coco.py              | 15 -----
 .../yolov7/ixrt/datasets/common.py            | 15 -----
 .../yolov7/ixrt/datasets/post_process.py      | 15 -----
 .../yolov7/ixrt/datasets/pre_process.py       | 15 -----
 .../yolov7/ixrt/datasets/vision.py            | 15 -----
 .../cv/object_detection/yolov7/ixrt/deploy.py | 43 +++---------
 .../object_detection/yolov7/ixrt/inference.py | 66 +++++++++----------
 .../yolov7/ixrt/load_ixrt_plugin.py           | 15 -----
 .../yolov7/ixrt/modify_batchsize.py           | 15 -----
 .../cv/object_detection/yolov7/ixrt/quant.py  | 18 +----
 .../scripts/infer_yolov7_fp16_accuracy.sh     | 28 +++-----
 .../scripts/infer_yolov7_fp16_performance.sh  | 33 ++++------
 .../scripts/infer_yolov7_int8_accuracy.sh     | 24 ++-----
 .../scripts/infer_yolov7_int8_performance.sh  | 26 +++-----
 .../yolov7/ixrt/simplify_model.py             | 15 -----
 tests/run_ixrt.py                             |  2 +-
 25 files changed, 92 insertions(+), 406 deletions(-)

diff --git a/models/cv/object_detection/yolov7/ixrt/README.md b/models/cv/object_detection/yolov7/ixrt/README.md
index b366ce6e..7d867750 100644
--- a/models/cv/object_detection/yolov7/ixrt/README.md
+++ b/models/cv/object_detection/yolov7/ixrt/README.md
@@ -48,10 +48,10 @@ mv yolov7.onnx /Path/to/checkpoints/yolov7m.onnx
 
 ```bash
 export PROJ_DIR=/Path/to/yolov7/ixrt
-export DATASETS_DIR=/Path/to/coco2017/
+export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-export EVAL_DIR=${DATASETS_DIR}/val2017
+export EVAL_DIR=${DATASETS_DIR}/images/val2017
 export RUN_DIR=/Path/to/yolov7/ixrt
 export CONFIG_DIR=config/YOLOV7_CONFIG
 ```
diff --git a/models/cv/object_detection/yolov7/ixrt/build_engine.py b/models/cv/object_detection/yolov7/ixrt/build_engine.py
index a919bdd0..d47e45e5 100644
--- a/models/cv/object_detection/yolov7/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov7/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
diff --git a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
index 3be0d83d..25f0ab8a 100644
--- a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import argparse
 import torch
@@ -21,9 +6,11 @@ from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
 
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
 def create_onnx(args):
     nms = helper.make_node(
-        "NMS",
+        "DetectionNMS_IxRT",
         name="NMS",
         inputs=["nms_input"],
         outputs=["nms_output0", "nms_output1"],
diff --git a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
index de37775a..578e013d 100644
--- a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
@@ -1,22 +1,10 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
+
+
+
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov7/ixrt/coco_labels.py b/models/cv/object_detection/yolov7/ixrt/coco_labels.py
index 43f5bd82..69d38878 100644
--- a/models/cv/object_detection/yolov7/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov7/ixrt/coco_labels.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov7/ixrt/common.py b/models/cv/object_detection/yolov7/ixrt/common.py
index aba2117c..5f543555 100644
--- a/models/cv/object_detection/yolov7/ixrt/common.py
+++ b/models/cv/object_detection/yolov7/ixrt/common.py
@@ -1,23 +1,9 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -81,13 +67,15 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
         print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG b/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
index e6cb457d..4803e368 100644
--- a/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
+++ b/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # BSZ : 构建engine以及推理时的batchsize
 # IMGSIZE : 模型输入hw大小
 # RUN_MODE : [FPS, MAP]
@@ -34,7 +19,6 @@ LAYER_FUSION=1
 DECODER_FASTER=1
 DECODER_NUM_CLASS=80
 DECODER_INPUT_NAMES=(/model/model.105/m.0/Conv_output_0 /model/model.105/m.1/Conv_output_0 /model/model.105/m.2/Conv_output_0)
-
 DECODER_8_ANCHOR=(12 16 19 36 40 28)
 DECODER_16_ANCHOR=(36 75 76 55 72 146)
 DECODER_32_ANCHOR=(142 110 192 243 459 401)
diff --git a/models/cv/object_detection/yolov7/ixrt/cut_model.py b/models/cv/object_detection/yolov7/ixrt/cut_model.py
index e9ee19aa..af0a3a4f 100644
--- a/models/cv/object_detection/yolov7/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov7/ixrt/cut_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py
index 162e24b4..e69de29b 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py
@@ -1,14 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py b/models/cv/object_detection/yolov7/ixrt/datasets/coco.py
index 73c5df54..7f355b84 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/coco.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os.path
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/common.py b/models/cv/object_detection/yolov7/ixrt/datasets/common.py
index ef36eba3..e120e00f 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/common.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/common.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
index 8590816a..a58c02f8 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
index c651f8ad..8cc643a8 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py b/models/cv/object_detection/yolov7/ixrt/datasets/vision.py
index eadefb2c..32da4a78 100644
--- a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py
+++ b/models/cv/object_detection/yolov7/ixrt/datasets/vision.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov7/ixrt/deploy.py b/models/cv/object_detection/yolov7/ixrt/deploy.py
index 8c2cc424..83f80a9e 100644
--- a/models/cv/object_detection/yolov7/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov7/ixrt/deploy.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
 import argparse
@@ -77,17 +62,16 @@ def customize_ops(graph, args):
         stride=16,
         faster_impl=args.faster
     )
-    
+    graph = t.AddYoloDecoderOp(
+        inputs=decoder_input[num*2:num*2+1],
+        outputs=["decoder_32"],
+        op_type=args.decoder_type,
+        anchor=args.decoder32_anchor,
+        num_class=args.num_class,
+        stride=32,
+        faster_impl=args.faster
+    )
     if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:num*2+1],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
         graph = t.AddYoloDecoderOp(
             inputs=decoder_input[num*2+1:],
             outputs=["decoder_64"],
@@ -103,15 +87,6 @@ def customize_ops(graph, args):
             axis=1
         )
     else:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
diff --git a/models/cv/object_detection/yolov7/ixrt/inference.py b/models/cv/object_detection/yolov7/ixrt/inference.py
index c0476b89..5637b839 100644
--- a/models/cv/object_detection/yolov7/ixrt/inference.py
+++ b/models/cv/object_detection/yolov7/ixrt/inference.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
@@ -25,8 +10,8 @@ import sys
 
 import torch
 import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 from coco_labels import coco80_to_coco91_class, labels
 from common import save2json, box_class85to6
@@ -62,7 +47,7 @@ def main(config):
 
     bsz = config.bsz
     num_samples = 5000
-    if config.loop_count > 0 and config.loop_count < num_samples/bsz :
+    if config.loop_count > 0:
         num_samples = bsz * config.loop_count
     num_batch = len(dataloader)
     print("=" * 30)
@@ -104,22 +89,26 @@ def main(config):
     for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader):
         batch_data = batch_data.numpy()
         batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()]
+        
         # batch_img_id = batch_img_id.numpy()
 
         cur_bsz_sample = batch_data.shape[0]
+      
 
         # Set input
-        cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
         # Forward
-        start_time = time.time()
+        
         context.execute_v2(allocations)
-        end_time = time.time()
-        forward_time += end_time - start_time
+        
+        
 
         if config.test_mode == "MAP":
             # Fetch output
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 1 : prepare data to nms
             _, box_num, box_unit = output.shape
@@ -138,10 +127,13 @@ def main(config):
             if config.nms_type == "GPU":
 
                 # Set nms input
-                cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input)
+                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
-                cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"])
-                cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"])
+                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 3 : post process + save
             pred_boxes = post_process_func(
@@ -153,10 +145,15 @@ def main(config):
                 max_det=config.max_det
             )
             save2json(batch_img_id, pred_boxes, json_result, class_map)
-
-    fps = num_samples / forward_time
-
+            
+            
     if config.test_mode == "FPS":
+        start_time = time.time()       
+        for i in range(config.loop_count):
+            context.execute_v2(allocations)  
+        end_time = time.time()  
+        forward_time = end_time - start_time      
+        fps = (config.loop_count*config.bsz) / forward_time
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
         if fps >= config.fps_target:
@@ -164,12 +161,12 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
     if config.test_mode == "MAP":
         if len(json_result) == 0:
             print("Predict zero box!")
-            exit(1)
+            exit(10)
 
         if not os.path.exists(config.pred_dir):
             os.makedirs(config.pred_dir)
@@ -180,7 +177,6 @@ def main(config):
         with open(pred_json, "w") as f:
             json.dump(json_result, f)
 
-        start_time = time.time()
         anno_json = config.coco_gt
         anno = COCO(anno_json)  # init annotations api
         pred = anno.loadRes(pred_json)  # init predictions api
@@ -192,18 +188,16 @@ def main(config):
             f"==============================eval {config.model_name} {config.precision} coco map =============================="
         )
         eval.summarize()
-        e2e_time = time.time() - start_time
+
         map, map50 = eval.stats[:2]
-        print(F"E2E time : {e2e_time:.3f} seconds")
         print("MAP@0.5 : ", map50)
         print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        print(F"E2E time : {e2e_time:.3f} seconds")
         if map50 >= config.map_target:
             print("pass!")
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
 def parse_config():
     parser = argparse.ArgumentParser()
diff --git a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
index ae47dc8e..932efbdf 100644
--- a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
index 3a88c160..00ed65dd 100644
--- a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 
diff --git a/models/cv/object_detection/yolov7/ixrt/quant.py b/models/cv/object_detection/yolov7/ixrt/quant.py
index 36fd39a1..d73212ca 100644
--- a/models/cv/object_detection/yolov7/ixrt/quant.py
+++ b/models/cv/object_detection/yolov7/ixrt/quant.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import random
 import argparse
@@ -20,6 +5,9 @@ import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
+import sys
+sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
+print(sys.path)
 from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
index 140ab8ac..30132700 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.68
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -125,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -147,7 +137,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
index 01542134..aca4b01c 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=425
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -90,6 +80,7 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
+
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -125,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -147,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -190,7 +181,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}               \
+    --model_engine=${ENGINE_FILE}              \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
index 18d11eff..d9cbd209 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.68
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
index 08525d28..051473b2 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=425
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
diff --git a/models/cv/object_detection/yolov7/ixrt/simplify_model.py b/models/cv/object_detection/yolov7/ixrt/simplify_model.py
index 1400fd81..b4254b6f 100644
--- a/models/cv/object_detection/yolov7/ixrt/simplify_model.py
+++ b/models/cv/object_detection/yolov7/ixrt/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 0bf60dbc..1ac40d2a 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -271,7 +271,7 @@ def run_detec_testcase(model):
         export PROJ_DIR=./
         export CHECKPOINTS_DIR=./checkpoints
         export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
-        export EVAL_DIR=./{dataset_n}/val2017
+        export EVAL_DIR=./{dataset_n}/images/val2017
         export RUN_DIR=./
         export CONFIG_DIR=config/{config_name}_CONFIG
 
-- 
Gitee


From 1bbed5f088a2540cde5fab5334fd263372caf2f7 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Mon, 30 Jun 2025 18:09:28 +0800
Subject: [PATCH 04/15] update yolox

---
 .../yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh             | 2 +-
 .../yolox/ixrt/scripts/infer_yolox_fp16_performance.sh          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
index ed40e8dc..455a5483 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_accuracy.sh
@@ -18,7 +18,7 @@
 batchsize=${BATCH_SIZE:-"32"}
 model_path="yolox"
 datasets_path=${DATASETS_DIR}
-DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
+DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
 
 # cut onnx
 python3 python/cut_model.py                             \
diff --git a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
index c66562d6..913d9729 100644
--- a/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
+++ b/models/cv/object_detection/yolox/ixrt/scripts/infer_yolox_fp16_performance.sh
@@ -18,7 +18,7 @@
 batchsize=${BATCH_SIZE:-"32"}
 model_path="yolox"
 datasets_path=${DATASETS_DIR}
-DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
+DECODER_INPUT_NAMES="/head/obj_preds.0/Conv_output_0 /head/cls_preds.0/Conv_output_0 /head/reg_preds.1/Conv_output_0 /head/cls_preds.1/Conv_output_0 /head/reg_preds.2/Conv_output_0 /head/obj_preds.2/Conv_output_0 /head/cls_preds.2/Conv_output_0"
 
 # cut onnx
 python3 python/cut_model.py                             \
-- 
Gitee


From adf97e5ef011fda47eac7818102616f981e60a66 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Tue, 1 Jul 2025 09:25:59 +0800
Subject: [PATCH 05/15] sync yolov5

---
 .../cv/object_detection/yolov5/ixrt/README.md |  4 +-
 .../yolov5/ixrt/build_engine.py               | 15 -----
 .../yolov5/ixrt/build_nms_engine.py           | 19 +-----
 .../yolov5/ixrt/calibration_dataset.py        | 17 +-----
 .../yolov5/ixrt/coco_labels.py                | 15 -----
 .../cv/object_detection/yolov5/ixrt/common.py | 22 ++-----
 .../config/{YOLOV5_CONFIG => YOLOV5M_CONFIG}  | 15 -----
 .../object_detection/yolov5/ixrt/cut_model.py | 15 -----
 .../yolov5/ixrt/datasets/__init__.py          | 14 -----
 .../yolov5/ixrt/datasets/coco.py              | 15 -----
 .../yolov5/ixrt/datasets/common.py            | 15 -----
 .../yolov5/ixrt/datasets/post_process.py      | 15 -----
 .../yolov5/ixrt/datasets/pre_process.py       | 15 -----
 .../yolov5/ixrt/datasets/vision.py            | 15 -----
 .../cv/object_detection/yolov5/ixrt/deploy.py | 60 +++++++-----------
 .../object_detection/yolov5/ixrt/inference.py | 61 ++++++++-----------
 .../yolov5/ixrt/load_ixrt_plugin.py           | 15 -----
 .../yolov5/ixrt/modify_batchsize.py           | 15 -----
 .../cv/object_detection/yolov5/ixrt/quant.py  | 18 +-----
 .../scripts/infer_yolov5_fp16_accuracy.sh     | 31 ++++------
 .../scripts/infer_yolov5_fp16_performance.sh  | 36 +++++------
 .../scripts/infer_yolov5_int8_accuracy.sh     | 31 ++++------
 .../scripts/infer_yolov5_int8_performance.sh  | 33 ++++------
 .../yolov5/ixrt/simplify_model.py             | 15 -----
 tests/run_ixrt.py                             |  2 +
 25 files changed, 114 insertions(+), 414 deletions(-)
 rename models/cv/object_detection/yolov5/ixrt/config/{YOLOV5_CONFIG => YOLOV5M_CONFIG} (73%)

diff --git a/models/cv/object_detection/yolov5/ixrt/README.md b/models/cv/object_detection/yolov5/ixrt/README.md
index 6870c7d0..4394ba8c 100644
--- a/models/cv/object_detection/yolov5/ixrt/README.md
+++ b/models/cv/object_detection/yolov5/ixrt/README.md
@@ -58,9 +58,9 @@ export PROJ_DIR=/Path/to/yolov5/ixrt
 export DATASETS_DIR=/Path/to/coco2017/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-export EVAL_DIR=${DATASETS_DIR}/val2017
+export EVAL_DIR=${DATASETS_DIR}/images/val2017
 export RUN_DIR=/Path/to/yolov5/ixrt
-export CONFIG_DIR=config/YOLOV5_CONFIG
+export CONFIG_DIR=config/YOLOV5M_CONFIG
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/yolov5/ixrt/build_engine.py b/models/cv/object_detection/yolov5/ixrt/build_engine.py
index a919bdd0..d47e45e5 100644
--- a/models/cv/object_detection/yolov5/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov5/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
diff --git a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
index 3be0d83d..25f0ab8a 100644
--- a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import argparse
 import torch
@@ -21,9 +6,11 @@ from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
 
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
 def create_onnx(args):
     nms = helper.make_node(
-        "NMS",
+        "DetectionNMS_IxRT",
         name="NMS",
         inputs=["nms_input"],
         outputs=["nms_output0", "nms_output1"],
diff --git a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
index de37775a..7d3e3e48 100644
--- a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
@@ -1,22 +1,9 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
+
+
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov5/ixrt/coco_labels.py b/models/cv/object_detection/yolov5/ixrt/coco_labels.py
index 43f5bd82..69d38878 100644
--- a/models/cv/object_detection/yolov5/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov5/ixrt/coco_labels.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov5/ixrt/common.py b/models/cv/object_detection/yolov5/ixrt/common.py
index aba2117c..5f543555 100644
--- a/models/cv/object_detection/yolov5/ixrt/common.py
+++ b/models/cv/object_detection/yolov5/ixrt/common.py
@@ -1,23 +1,9 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -81,13 +67,15 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
         print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG b/models/cv/object_detection/yolov5/ixrt/config/YOLOV5M_CONFIG
similarity index 73%
rename from models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG
rename to models/cv/object_detection/yolov5/ixrt/config/YOLOV5M_CONFIG
index d6342be3..3eddc4f7 100644
--- a/models/cv/object_detection/yolov5/ixrt/config/YOLOV5_CONFIG
+++ b/models/cv/object_detection/yolov5/ixrt/config/YOLOV5M_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # BSZ : 构建engine以及推理时的batchsize
 # IMGSIZE : 模型输入hw大小
 # RUN_MODE : [FPS, MAP]
diff --git a/models/cv/object_detection/yolov5/ixrt/cut_model.py b/models/cv/object_detection/yolov5/ixrt/cut_model.py
index e9ee19aa..af0a3a4f 100644
--- a/models/cv/object_detection/yolov5/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov5/ixrt/cut_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py
index 162e24b4..e69de29b 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py
@@ -1,14 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py b/models/cv/object_detection/yolov5/ixrt/datasets/coco.py
index 73c5df54..7f355b84 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/coco.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os.path
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/common.py b/models/cv/object_detection/yolov5/ixrt/datasets/common.py
index ef36eba3..e120e00f 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/common.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/common.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
index 8590816a..a58c02f8 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
index c651f8ad..8cc643a8 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import cv2
 import math
 import numpy as np
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py b/models/cv/object_detection/yolov5/ixrt/datasets/vision.py
index eadefb2c..32da4a78 100644
--- a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py
+++ b/models/cv/object_detection/yolov5/ixrt/datasets/vision.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 from typing import Any, Callable, List, Optional, Tuple
 
diff --git a/models/cv/object_detection/yolov5/ixrt/deploy.py b/models/cv/object_detection/yolov5/ixrt/deploy.py
index 8c2cc424..ec56b7ab 100644
--- a/models/cv/object_detection/yolov5/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov5/ixrt/deploy.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
 import argparse
@@ -77,17 +62,16 @@ def customize_ops(graph, args):
         stride=16,
         faster_impl=args.faster
     )
-    
+    graph = t.AddYoloDecoderOp(
+        inputs=decoder_input[num*2:num*2+1],
+        outputs=["decoder_32"],
+        op_type=args.decoder_type,
+        anchor=args.decoder32_anchor,
+        num_class=args.num_class,
+        stride=32,
+        faster_impl=args.faster
+    )
     if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:num*2+1],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
         graph = t.AddYoloDecoderOp(
             inputs=decoder_input[num*2+1:],
             outputs=["decoder_64"],
@@ -102,25 +86,24 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    else:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2:],
-            outputs=["decoder_32"],
-            op_type=args.decoder_type,
-            anchor=args.decoder32_anchor,
-            num_class=args.num_class,
-            stride=32,
-            faster_impl=args.faster
-        )
+    elif args.with_nms:
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
             axis=1
         )
 
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
+        graph.outputs.clear()
+        graph.add_output("output")
+        graph.outputs["output"].dtype = "FLOAT"
+    else:
+        graph.outputs.clear()
+        graph.add_output("decoder_8")
+        graph.outputs["decoder_8"].dtype = "FLOAT"
+        graph.add_output("decoder_16")
+        graph.outputs["decoder_16"].dtype = "FLOAT"
+        graph.add_output("decoder_32")
+        graph.outputs["decoder_32"].dtype = "FLOAT"
     return graph
 
 def parse_args():
@@ -128,6 +111,7 @@ def parse_args():
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
     parser.add_argument("--decoder_input_names", nargs='+', type=str)
     parser.add_argument("--decoder8_anchor", nargs='*', type=int)
     parser.add_argument("--decoder16_anchor", nargs='*', type=int)
diff --git a/models/cv/object_detection/yolov5/ixrt/inference.py b/models/cv/object_detection/yolov5/ixrt/inference.py
index c0476b89..5f5452d5 100644
--- a/models/cv/object_detection/yolov5/ixrt/inference.py
+++ b/models/cv/object_detection/yolov5/ixrt/inference.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
@@ -25,8 +10,8 @@ import sys
 
 import torch
 import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 from coco_labels import coco80_to_coco91_class, labels
 from common import save2json, box_class85to6
@@ -62,7 +47,7 @@ def main(config):
 
     bsz = config.bsz
     num_samples = 5000
-    if config.loop_count > 0 and config.loop_count < num_samples/bsz :
+    if config.loop_count > 0:
         num_samples = bsz * config.loop_count
     num_batch = len(dataloader)
     print("=" * 30)
@@ -109,17 +94,19 @@ def main(config):
         cur_bsz_sample = batch_data.shape[0]
 
         # Set input
-        cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
         # Forward
-        start_time = time.time()
+        # start_time = time.time()
         context.execute_v2(allocations)
-        end_time = time.time()
-        forward_time += end_time - start_time
+        # end_time = time.time()
+        # forward_time += end_time - start_time
 
         if config.test_mode == "MAP":
             # Fetch output
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 1 : prepare data to nms
             _, box_num, box_unit = output.shape
@@ -138,10 +125,13 @@ def main(config):
             if config.nms_type == "GPU":
 
                 # Set nms input
-                cuda.memcpy_htod(nms_inputs[0]["allocation"], nms_input)
+                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
-                cuda.memcpy_dtoh(nms_output0, nms_outputs[0]["allocation"])
-                cuda.memcpy_dtoh(nms_output1, nms_outputs[1]["allocation"])
+                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             # Step 3 : post process + save
             pred_boxes = post_process_func(
@@ -154,9 +144,15 @@ def main(config):
             )
             save2json(batch_img_id, pred_boxes, json_result, class_map)
 
-    fps = num_samples / forward_time
+    # fps = num_samples / forward_time
 
     if config.test_mode == "FPS":
+        start_time = time.time()       
+        for i in range(config.loop_count):
+            context.execute_v2(allocations)  
+        end_time = time.time()  
+        forward_time = end_time - start_time      
+        fps = (config.loop_count*config.bsz) / forward_time
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
         if fps >= config.fps_target:
@@ -164,12 +160,12 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
     if config.test_mode == "MAP":
         if len(json_result) == 0:
             print("Predict zero box!")
-            exit(1)
+            exit(10)
 
         if not os.path.exists(config.pred_dir):
             os.makedirs(config.pred_dir)
@@ -180,7 +176,6 @@ def main(config):
         with open(pred_json, "w") as f:
             json.dump(json_result, f)
 
-        start_time = time.time()
         anno_json = config.coco_gt
         anno = COCO(anno_json)  # init annotations api
         pred = anno.loadRes(pred_json)  # init predictions api
@@ -192,18 +187,16 @@ def main(config):
             f"==============================eval {config.model_name} {config.precision} coco map =============================="
         )
         eval.summarize()
-        e2e_time = time.time() - start_time
+
         map, map50 = eval.stats[:2]
-        print(F"E2E time : {e2e_time:.3f} seconds")
         print("MAP@0.5 : ", map50)
         print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        print(F"E2E time : {e2e_time:.3f} seconds")
         if map50 >= config.map_target:
             print("pass!")
             exit()
         else:
             print("failed!")
-            exit(1)
+            exit(10)
 
 def parse_config():
     parser = argparse.ArgumentParser()
diff --git a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
index ae47dc8e..932efbdf 100644
--- a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
index 3a88c160..00ed65dd 100644
--- a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 
diff --git a/models/cv/object_detection/yolov5/ixrt/quant.py b/models/cv/object_detection/yolov5/ixrt/quant.py
index 36fd39a1..d73212ca 100644
--- a/models/cv/object_detection/yolov5/ixrt/quant.py
+++ b/models/cv/object_detection/yolov5/ixrt/quant.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import random
 import argparse
@@ -20,6 +5,9 @@ import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
+import sys
+sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
+print(sys.path)
 from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
index 140ab8ac..cd65d210 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.626
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -125,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -147,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -163,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
index 01542134..3f841458 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=735
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -90,6 +80,7 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
+
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -125,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,6 +124,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -147,7 +139,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -163,7 +155,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -190,7 +182,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}               \
+    --model_engine=${ENGINE_FILE}              \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
index 18d11eff..24829da8 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
@@ -1,31 +1,18 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=-1
+TGT=0.626
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -125,7 +115,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -133,6 +123,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -147,7 +138,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -163,7 +154,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
index 08525d28..8afcb722 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
@@ -1,32 +1,19 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=-1
-LOOP_COUNT=10
+TGT=735
+LOOP_COUNT=100
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -54,6 +41,9 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
+
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -126,7 +116,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -134,6 +124,7 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
+            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -148,7 +139,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -164,7 +155,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/simplify_model.py b/models/cv/object_detection/yolov5/ixrt/simplify_model.py
index 1400fd81..b4254b6f 100644
--- a/models/cv/object_detection/yolov5/ixrt/simplify_model.py
+++ b/models/cv/object_detection/yolov5/ixrt/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 1ac40d2a..90692c6b 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -259,6 +259,8 @@ def run_detec_testcase(model):
     run_script(prepare_script)
 
     config_name = model_name.upper()
+    if model_name == "yolov5":
+        config_name = "YOLOV5M"
 
     for prec in model["precisions"]:
         logging.info(f"Start running {model_name} {prec} test case")
-- 
Gitee


From 31b589aa5c447928dc4dd6e106d7bcfb2d5a8d39 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Tue, 1 Jul 2025 13:07:07 +0800
Subject: [PATCH 06/15] sync fcos

---
 .../cv/object_detection/fcos/ixrt/README.md   |  33 +-
 .../fcos/ixrt/build_engine.py                 |  78 +----
 .../fcos/ixrt/calibration_dataset.py          | 105 +-----
 .../object_detection/fcos/ixrt/ci/prepare.sh  |  34 +-
 .../cv/object_detection/fcos/ixrt/common.py   |  43 ++-
 .../fcos/ixrt/datasets/__init__.py            |   0
 .../fcos/ixrt/datasets/coco.py                | 116 +++++++
 .../fcos/ixrt/datasets/common.py              |  68 ++++
 .../fcos/ixrt/datasets/post_process.py        | 157 +++++++++
 .../fcos/ixrt/datasets/pre_process.py         |  76 ++++
 .../fcos/ixrt/datasets/vision.py              | 136 ++++++++
 ...nreg-giou_r50_caffe_fpn_gn-head_1x_coco.py |  69 ----
 .../fcos/ixrt/fcos_ixrt_inference.py          | 202 -----------
 .../fcos_r50_caffe_fpn_gn-head_1x_coco.py     | 326 +++++++++++++-----
 .../fcos/ixrt/inference_mmdet.py              | 192 +++++++++++
 .../fcos/ixrt/load_ixrt_plugin.py             |  12 +
 .../fcos/ixrt/modify_batchsize.py             |  37 ++
 .../fcos/ixrt/requirements.txt                |  11 +-
 .../ixrt/scripts/infer_fcos_fp16_accuracy.sh  |  56 +--
 .../scripts/infer_fcos_fp16_performance.sh    |  56 +--
 .../fcos/ixrt/simplify_model.py               |  15 -
 tests/run_ixrt.py                             |  17 +
 22 files changed, 1162 insertions(+), 677 deletions(-)
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/__init__.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/coco.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/common.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/post_process.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/datasets/vision.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/inference_mmdet.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
 create mode 100644 models/cv/object_detection/fcos/ixrt/modify_batchsize.py

diff --git a/models/cv/object_detection/fcos/ixrt/README.md b/models/cv/object_detection/fcos/ixrt/README.md
index b11d07f0..dcc15b7c 100755
--- a/models/cv/object_detection/fcos/ixrt/README.md
+++ b/models/cv/object_detection/fcos/ixrt/README.md
@@ -34,47 +34,22 @@ apt install -y libgl1-mesa-glx
 pip3 install -r requirements.txt
 ```
 
-The inference of the FCOS model requires a dependency on a well-adapted mmcv-v1.7.0 library. Please inquire with the staff to obtain the relevant libraries.
-
-You can follow the script [prepare_mmcv.sh](https://gitee.com/deep-spark/deepsparkhub/blob/master/toolbox/MMDetection/prepare_mmcv.sh) to build:
-
-```bash
-cd mmcv
-sh build_mmcv.sh
-sh install_mmcv.sh
-```
-
 ### Model Conversion
 
 MMDetection is an open source object detection toolbox based on PyTorch. It is a part of the OpenMMLab project.It is utilized for model conversion. In MMDetection, Execute model conversion command, and the checkpoints folder needs to be created, (mkdir checkpoints) in project
 
 ```bash
 mkdir -p checkpoints
-git clone -b v2.25.0 https://github.com/open-mmlab/mmdetection.git
-cd mmdetection
-python3 tools/deployment/pytorch2onnx.py \
-    /Path/to/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py \
-    checkpoints/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth \
-    --output-file /Path/To/ixrt/data/checkpoints/r50_fcos.onnx \
-    --input-img demo/demo.jpg \
-    --test-img tests/data/color.jpg \
-    --shape 800 800 \
-    --show \
-    --verify \
-    --skip-postprocess \
-    --dynamic-export \
-    --cfg-options \
-      model.test_cfg.deploy_nms_pre=-1
+cd checkpoints
+wget http://files.deepspark.org.cn:880/deepspark/fcos_opt.onnx
 ```
 
-If there are issues such as input parameter mismatch during model export, it may be due to ONNX version. To resolve this, please delete the last parameter (dynamic_slice) from the return value of the_slice_helper function in the /usr/local/lib/python3.10/site-packages/mmcv/onnx/onnx_utils/symbolic_helper.py file.
-
 ## Model Inference
 
 ```bash
 export PROJ_DIR=./
-export DATASETS_DIR=/Path/to/coco/
-export CHECKPOINTS_DIR=/Path/to/checkpoints
+export DATASETS_DIR=./coco/
+export CHECKPOINTS_DIR=./checkpoints
 export RUN_DIR=./
 ```
 
diff --git a/models/cv/object_detection/fcos/ixrt/build_engine.py b/models/cv/object_detection/fcos/ixrt/build_engine.py
index af649916..d47e45e5 100755
--- a/models/cv/object_detection/fcos/ixrt/build_engine.py
+++ b/models/cv/object_detection/fcos/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
@@ -20,55 +5,12 @@ import numpy as np
 
 import torch
 import tensorrt
-from calibration_dataset import getdataloader
-import cuda.cudart as cudart
-
-def assertSuccess(err):
-    assert(err == cudart.cudaError_t.cudaSuccess)
-
-class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2):
-
-    def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=800):
-        super().__init__()
-        self.cache_file = cache_file
-        self.image_batcher  = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz)
-        self.batch_generator = iter(self.image_batcher)
-        size = img_sz*img_sz*3*bsz
-        __import__('pdb').set_trace()
-        err, self.batch_allocation = cudart.cudaMalloc(size)
-        assertSuccess(err)
-
-    def __del__(self):
-        err,= cudart.cudaFree(self.batch_allocation)
-        assertSuccess(err)
 
-    def get_batch_size(self):
-        return self.image_batcher.batch_size
-
-    def get_batch(self, names):
-        try:
-            batch, _ = next(self.batch_generator)
-            batch = batch.numpy()
-            __import__('pdb').set_trace()
-            cudart.cudaMemcpy(self.batch_allocation,
-                              np.ascontiguousarray(batch),
-                              batch.nbytes,
-                              cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
-            return [int(self.batch_allocation)]
-        except StopIteration:
-            return None
-
-    def read_calibration_cache(self):
-        if os.path.exists(self.cache_file):
-            with open(self.cache_file, "rb") as f:
-                return f.read()
-
-    def write_calibration_cache(self, cache):
-        with open(self.cache_file, "wb") as f:
-            f.write(cache)
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
 
 def main(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)
+    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
     builder = tensorrt.Builder(IXRT_LOGGER)
     EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
     network = builder.create_network(EXPLICIT_BATCH)
@@ -76,8 +18,8 @@ def main(config):
     parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
     parser.parse_from_file(config.model)
 
-    precision = tensorrt.BuilderFlag.FP16
-    print("precision : ", precision)
+    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+    # print("precision : ", precision)
     build_config.set_flag(precision)
 
     plan = builder.build_serialized_network(network, build_config)
@@ -88,13 +30,11 @@ def main(config):
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str)
+    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+            help="The precision of datatype")
+    # engine args
     parser.add_argument("--engine", type=str, default=None)
-    parser.add_argument(
-        "--datasets_dir",
-        type=str,
-        default="",
-        help="ImageNet dir",
-    )
+
     args = parser.parse_args()
     return args
 
diff --git a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py b/models/cv/object_detection/fcos/ixrt/calibration_dataset.py
index d7525d51..2473f7d0 100644
--- a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/fcos/ixrt/calibration_dataset.py
@@ -1,81 +1,19 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
-
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
-from torchvision import models
-from torchvision import transforms as T
-
-
-class CalibrationImageNet(torchvision.datasets.ImageFolder):
-    def __init__(self, *args, **kwargs):
-        super(CalibrationImageNet, self).__init__(*args, **kwargs)
-        img2label_path = os.path.join(self.root, "val_map.txt")
-        if not os.path.exists(img2label_path):
-            raise FileNotFoundError(f"Not found label file `{img2label_path}`.")
-
-        self.img2label_map = self.make_img2label_map(img2label_path)
-
-    def make_img2label_map(self, path):
-        with open(path) as f:
-            lines = f.readlines()
-
-        img2lable_map = dict()
-        for line in lines:
-            line = line.lstrip().rstrip().split("\t")
-            if len(line) != 2:
-                continue
-            img_name, label = line
-            img_name = img_name.strip()
-            if img_name in [None, ""]:
-                continue
-            label = int(label.strip())
-            img2lable_map[img_name] = label
-        return img2lable_map
-
-    def __getitem__(self, index):
-        path, target = self.samples[index]
-        sample = self.loader(path)
-        if self.transform is not None:
-            sample = self.transform(sample)
-        # if self.target_transform is not None:
-        #     target = self.target_transform(target)
-        img_name = os.path.basename(path)
-        target = self.img2label_map[img_name]
-
-        return sample, target
-
-
-def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0):
-    dataset = CalibrationImageNet(
-        data_path,
-        transform=T.Compose(
-            [
-                T.Resize(256),
-                T.CenterCrop(img_sz),
-                T.ToTensor(),
-                T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-            ]
-        ),
+from datasets.coco import CocoDetection
+
+def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
+    dataset = CocoDetection(
+        root=data_path,
+        annFile=annFile,
+        img_size=img_sz,
+        data_process_type=data_process_type
     )
-
     calibration_dataset = dataset
-    if num_samples is not None:
+    num_samples = min(5000, batch_size * step)
+    if num_samples > 0:
         calibration_dataset = torch.utils.data.Subset(
             dataset, indices=range(num_samples)
         )
@@ -87,27 +25,4 @@ def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, wo
         drop_last=False,
         num_workers=workers,
     )
-
-    verify_dataloader = DataLoader(
-        dataset,
-        shuffle=False,
-        batch_size=batch_size,
-        drop_last=False,
-        num_workers=workers,
-    )
-
-    return calibration_dataloader, verify_dataloader
-
-
-def getdataloader(dataset_dir, step=20, batch_size=32, workers=2, img_sz=224, total_sample=50000):
-    num_samples = min(total_sample, step * batch_size)
-    if step < 0:
-        num_samples = None
-    calibration_dataloader, _ = create_dataloaders(
-        dataset_dir,
-        img_sz=img_sz,
-        batch_size=batch_size,
-        workers=workers,
-        num_samples=num_samples,
-    )
     return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/ci/prepare.sh b/models/cv/object_detection/fcos/ixrt/ci/prepare.sh
index a19f4811..a0469775 100644
--- a/models/cv/object_detection/fcos/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/fcos/ixrt/ci/prepare.sh
@@ -25,36 +25,6 @@ else
     echo "Not Support Os"
 fi
 pip3 install -r requirements.txt
-cp -r /root/data/3rd_party/mmcv-v1.7.1 ./mmcv
-cp -r -T /root/data/repos/deepsparkhub/toolbox/MMDetection/patch/mmcv/v1.7.1 ./mmcv
-cd mmcv
-rm -rf mmcv/ops/csrc/common/cuda/spconv/ mmcv/ops/csrc/common/utils/spconv/
-rm -f mmcv/ops/csrc/pytorch/cpu/sparse_*
-rm -f mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/sparse_*
-rm -f mmcv/ops/csrc/pytorch/sp*
-
-sed -i 's/return _slice(g, input, axes, starts, ends, steps, dynamic_slice)/return _slice(g, input, axes, starts, ends, steps)/' mmcv/onnx/onnx_utils/symbolic_helper.py
-
-bash clean_mmcv.sh
-bash build_mmcv.sh
-bash install_mmcv.sh
-cd ..
-
+pip install /root/data/install/mmcv_full-1.7.0+corex.20250108131027-cp310-cp310-linux_x86_64.whl
 mkdir -p checkpoints
-cp -r /root/data/3rd_party/mmdetection-v2.25.0 ./mmdetection
-cd mmdetection
-python3 tools/deployment/pytorch2onnx.py \
-    ../fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py \
-    /root/data/checkpoints/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco-0a0d75a8.pth \
-    --output-file ../checkpoints/r50_fcos.onnx \
-    --input-img demo/demo.jpg \
-    --test-img tests/data/color.jpg \
-    --shape 800 800 \
-    --show \
-    --verify \
-    --skip-postprocess \
-    --dynamic-export \
-    --cfg-options \
-      model.test_cfg.deploy_nms_pre=-1
\ No newline at end of file
+cp /root/data/checkpoints/fcos_opt.onnx checkpoints/
diff --git a/models/cv/object_detection/fcos/ixrt/common.py b/models/cv/object_detection/fcos/ixrt/common.py
index b18a2439..7d9a078e 100644
--- a/models/cv/object_detection/fcos/ixrt/common.py
+++ b/models/cv/object_detection/fcos/ixrt/common.py
@@ -1,23 +1,8 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import pycuda.driver as cuda
+from cuda import cuda, cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -33,7 +18,25 @@ def box_class85to6(input):
     nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
     return nms_input
 
-def save2json(batch_img_id, pred_boxes, json_result):
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+    for i, boxes in enumerate(pred_boxes):
+        if boxes is not None:
+            image_id = int(batch_img_id[i])
+            # have no target
+            if image_id == -1:
+                continue
+            for x, y, w, h, c, p in boxes:
+                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
+                c = int(c)
+                json_result.append(
+                    {
+                        "image_id": image_id,
+                        "category_id": class_trans[c - 1],
+                        "bbox": [x, y, w, h],
+                        "score": p,
+                    }
+                )
+def save2json_nonms(batch_img_id, pred_boxes, json_result):
     for i, boxes in enumerate(pred_boxes):
         image_id = int(batch_img_id)
         if boxes is not None:
@@ -80,15 +83,17 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert err == cudart.cudaError_t.cudaSuccess
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
-        # print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
+        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
         if engine.binding_is_input(i):
             inputs.append(binding)
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/__init__.py b/models/cv/object_detection/fcos/ixrt/datasets/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/coco.py b/models/cv/object_detection/fcos/ixrt/datasets/coco.py
new file mode 100644
index 00000000..7f355b84
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/datasets/coco.py
@@ -0,0 +1,116 @@
+import os.path
+from typing import Any, Callable, List, Optional, Tuple
+
+import cv2
+
+from .vision import VisionDataset
+from .pre_process import get_post_process
+class CocoDetection(VisionDataset):
+    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
+
+    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
+
+    Args:
+        root (string): Root directory where images are downloaded to.
+        annFile (string): Path to json annotation file.
+        transform (callable, optional): A function/transform that  takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.PILToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+    """
+
+    def __init__(
+        self,
+        root: str,
+        annFile: str,
+        img_size: int,
+        data_process_type: str,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+        transforms: Optional[Callable] = None,
+        
+    ) -> None:
+        super().__init__(root, transforms, transform, target_transform)
+        from pycocotools.coco import COCO
+
+        self.coco = COCO(annFile)
+        self.ids = list(sorted(self.coco.imgs.keys()))
+        self.img_size = img_size
+        
+        self.transforms = get_post_process(data_process_type)
+
+    def _load_image(self, id: int):
+        path = self.coco.loadImgs(id)[0]["file_name"]
+        data = cv2.imread(os.path.join(self.root, path))
+        return data
+
+    def _load_target(self, id: int) -> List[Any]:
+        return self.coco.loadAnns(self.coco.getAnnIds(id))
+
+    def __getitem__(self, index: int) -> Tuple[Any, Any]:
+        id = self.ids[index]
+        image = self._load_image(id)
+        target = self._load_target(id)
+        origin_shape = image.shape[:2]
+
+        if self.transforms is not None:
+            image = self.transforms(image, self.img_size)
+
+        if len(target) > 0:
+            image_id = target[0]["image_id"]
+        else:
+            # have no target
+            image_id = -1
+        return image, origin_shape, image_id
+
+    def __len__(self) -> int:
+        return len(self.ids)
+
+
+class CocoCaptions(CocoDetection):
+    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
+
+    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
+
+    Args:
+        root (string): Root directory where images are downloaded to.
+        annFile (string): Path to json annotation file.
+        transform (callable, optional): A function/transform that  takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.PILToTensor``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+        transforms (callable, optional): A function/transform that takes input sample and its target as entry
+            and returns a transformed version.
+
+    Example:
+
+        .. code:: python
+
+            import torchvision.datasets as dset
+            import torchvision.transforms as transforms
+            cap = dset.CocoCaptions(root = 'dir where images are',
+                                    annFile = 'json annotation file',
+                                    transform=transforms.PILToTensor())
+
+            print('Number of samples: ', len(cap))
+            img, target = cap[3] # load 4th sample
+
+            print("Image Size: ", img.size())
+            print(target)
+
+        Output: ::
+
+            Number of samples: 82783
+            Image Size: (3L, 427L, 640L)
+            [u'A plane emitting smoke stream flying over a mountain.',
+            u'A plane darts across a bright blue sky behind a mountain covered in snow',
+            u'A plane leaves a contrail above the snowy mountain top.',
+            u'A mountain that has a plane flying overheard in the distance.',
+            u'A mountain view with a plume of smoke in the background']
+
+    """
+
+    def _load_target(self, id: int) -> List[str]:
+        return [ann["caption"] for ann in super()._load_target(id)]
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/common.py b/models/cv/object_detection/fcos/ixrt/datasets/common.py
new file mode 100644
index 00000000..a8e5e6e7
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/datasets/common.py
@@ -0,0 +1,68 @@
+import cv2
+import math
+import numpy as np
+
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+    # Resize and pad image while meeting stride-multiple constraints
+    shape = im.shape[:2]  # current shape [height, width]
+    if isinstance(new_shape, int):
+        new_shape = (new_shape, new_shape)
+
+    # Scale ratio (new / old)
+    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+    if not scaleup:  # only scale down, do not scale up (for better val mAP)
+        r = min(r, 1.0)
+
+    # Compute padding
+    ratio = r, r  # width, height ratios
+    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
+    if auto:  # minimum rectangle
+        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
+    elif scaleFill:  # stretch
+        dw, dh = 0.0, 0.0
+        new_unpad = (new_shape[1], new_shape[0])
+        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
+
+    dw /= 2  # divide padding into 2 sides
+    dh /= 2
+
+    if shape[::-1] != new_unpad:  # resize
+        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
+    return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+    # Rescale boxes (xyxy) from net_shape to ori_shape
+
+    if use_letterbox:
+
+        gain = min(
+            net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+        )  # gain  = new / old
+        pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+            net_shape[0] - ori_shape[0] * gain
+        ) / 2.0
+
+        boxes[:, [0, 2]] -= pad[0]  # x padding
+        boxes[:, [1, 3]] -= pad[1]  # y padding
+        boxes[:, :4] /= gain
+    else:
+        x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+        boxes[:, 0] /= x_scale
+        boxes[:, 1] /= y_scale
+        boxes[:, 2] /= x_scale
+        boxes[:, 3] /= y_scale
+
+    clip_boxes(boxes, ori_shape)
+    return boxes
+
+def clip_boxes(boxes, shape):
+
+    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
+    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
+    
+    return boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/post_process.py b/models/cv/object_detection/fcos/ixrt/datasets/post_process.py
new file mode 100644
index 00000000..7b411a50
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/datasets/post_process.py
@@ -0,0 +1,157 @@
+import cv2
+import math
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from .common import letterbox, scale_boxes, clip_boxes
+
+def get_post_process(data_process_type):
+    if data_process_type == "yolov5":
+        return Yolov5Postprocess
+    elif data_process_type == "yolov3":
+        return Yolov3Postprocess
+    elif data_process_type == "yolox":
+        return YoloxPostprocess
+    elif data_process_type == "detr":
+        return DetrPostprocess
+    return None
+
+def Yolov3Postprocess(
+    ori_img_shape,
+    imgsz,
+    box_datas,
+    box_nums,
+    sample_num,
+    max_det=1000,
+):
+    all_box = []
+    data_offset = 0
+
+    box_datas = box_datas.flatten()
+    box_nums = box_nums.flatten()
+
+    for i in range(sample_num):
+        box_num = box_nums[i]
+        if box_num == 0:
+            boxes = None
+        else:
+            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
+            boxes = scale_boxes(
+                (imgsz[0], imgsz[1]),
+                cur_box,
+                (ori_img_shape[0][i], ori_img_shape[1][i]),
+                use_letterbox=False
+            )
+            # xyxy2xywh
+            boxes[:, 2] -= boxes[:, 0]
+            boxes[:, 3] -= boxes[:, 1]
+
+        all_box.append(boxes)
+        data_offset += max_det * 6
+
+    return all_box
+
+def Yolov5Postprocess(
+    ori_img_shape,
+    imgsz,
+    box_datas,
+    box_nums,
+    sample_num,
+    max_det=1000,
+):
+    all_box = []
+    data_offset = 0
+
+    box_datas = box_datas.flatten()
+    box_nums = box_nums.flatten()
+
+    for i in range(sample_num):
+        box_num = box_nums[i]
+        if box_num == 0:
+            boxes = None
+        else:
+            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
+            boxes = scale_boxes(
+                (imgsz[0], imgsz[1]),
+                cur_box,
+                (ori_img_shape[0][i], ori_img_shape[1][i]),
+                use_letterbox=True
+            )
+            # xyxy2xywh
+            boxes[:, 2] -= boxes[:, 0]
+            boxes[:, 3] -= boxes[:, 1]
+
+        all_box.append(boxes)
+        data_offset += max_det * 6
+
+    return all_box
+
+def YoloxPostprocess(
+    ori_img_shape,
+    imgsz,
+    box_datas,
+    box_nums,
+    sample_num,
+    max_det=1000,
+):
+    all_box = []
+    data_offset = 0
+    box_datas = box_datas.flatten()
+    box_nums = box_nums.flatten()
+
+    for i in range(sample_num):
+        box_num = box_nums[i]
+        if box_num == 0:
+            boxes = None
+        else:
+            boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
+            r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i])
+            boxes[:, :4] /= r
+            # xyxy2xywh
+            boxes[:, 2] -= boxes[:, 0]
+            boxes[:, 3] -= boxes[:, 1]
+            clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i]))
+
+        all_box.append(boxes)
+        data_offset += max_det * 6
+
+    return all_box
+
+def box_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(-1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+
+def DetrPostprocess(pred_logits, pred_boxes, target_sizes):
+    
+    out_logits = torch.from_numpy(pred_logits) 
+    out_bbox = torch.from_numpy(pred_boxes)
+    assert len(target_sizes) == 2
+    
+    prob = F.softmax(out_logits, -1)
+    scores, labels = prob[..., :-1].max(-1)
+    
+    # convert to [x0, y0, x1, y1] format 
+    boxes = box_cxcywh_to_xyxy(out_bbox)
+    # and from relative [0, 1] to absolute [0, height] coordinates
+    img_w, img_h = target_sizes
+    scale_fct = torch.tensor([img_w, img_h, img_w, img_h])
+    boxes = boxes * scale_fct
+   
+    boxes = clip_boxes(boxes, target_sizes)
+    boxes = convert_to_xywh(boxes)
+
+    labels = labels.unsqueeze(1)
+    scores =scores.unsqueeze(1)
+    pred_boxes = torch.cat([
+            boxes, 
+            labels, 
+            scores], dim=1).numpy().tolist()
+    return pred_boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py b/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
new file mode 100644
index 00000000..e5b4ddfb
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
@@ -0,0 +1,76 @@
+import cv2
+import math
+import numpy as np
+
+from .common import letterbox
+
+def get_post_process(data_process_type):
+    if data_process_type == "yolov5":
+        return Yolov5Preprocess
+    elif data_process_type == "yolov3":
+        return Yolov3Preprocess
+    elif data_process_type == "yolox":
+        return YoloxPreprocess
+    elif data_process_type == "detr":
+        return DetrPreprocess
+    return None
+
+def Yolov3Preprocess(image, img_size):
+
+    h0, w0 = image.shape[:2]  # orig hw
+    r = img_size / max(h0, w0)  # ratio
+
+    image = cv2.resize(image, (img_size, img_size))
+    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
+    return image
+
+def Yolov5Preprocess(image, img_size, augment=False):
+
+    h0, w0 = image.shape[:2]  # orig hw
+    r = img_size / max(h0, w0)  # ratio
+
+    if r != 1:  # if sizes are not equal
+        interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA
+        image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
+
+    # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  rect == True
+
+    image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False)
+    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
+    return image
+
+def YoloxPreprocess(img, img_size, swap=(2,0,1)):
+
+    padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114
+    r = min(img_size / img.shape[0], img_size / img.shape[1])
+    resized_img = cv2.resize(
+        img,
+        (int(img.shape[1] * r), int(img.shape[0] * r)),
+        interpolation=cv2.INTER_LINEAR, 
+    ).astype(np.uint8)
+
+    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+    padded_img = padded_img.transpose(swap)
+    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+
+    return padded_img
+
+def DetrPreprocess(image, img_size):    
+    # img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+    # img = img.resize((img_size, img_size))
+    
+    std = [0.485, 0.456, 0.406] 
+    mean = [0.229, 0.224, 0.225]
+    
+    image = cv2.resize(image, (img_size, img_size))
+    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
+    
+    image[0,:,:] = (image[0,:,:]- std[0])/mean[0]
+    image[1,:,:] = (image[1,:,:]- std[1])/mean[1]
+    image[2,:,:] = (image[2,:,:]- std[2])/mean[2]
+    
+    return image
+    
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/vision.py b/models/cv/object_detection/fcos/ixrt/datasets/vision.py
new file mode 100644
index 00000000..32da4a78
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/datasets/vision.py
@@ -0,0 +1,136 @@
+import os
+from typing import Any, Callable, List, Optional, Tuple
+
+import torch
+import torch.utils.data as data
+
+from types import FunctionType
+
+def _log_api_usage_once(obj: Any) -> None:
+
+    """
+    Logs API usage(module and name) within an organization.
+    In a large ecosystem, it's often useful to track the PyTorch and
+    TorchVision APIs usage. This API provides the similar functionality to the
+    logging module in the Python stdlib. It can be used for debugging purpose
+    to log which methods are used and by default it is inactive, unless the user
+    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
+    Please note it is triggered only once for the same API call within a process.
+    It does not collect any data from open-source users since it is no-op by default.
+    For more information, please refer to
+    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
+    * Logging policy: https://github.com/pytorch/vision/issues/5052;
+
+    Args:
+        obj (class instance or method): an object to extract info from.
+    """
+    module = obj.__module__
+    if not module.startswith("torchvision"):
+        module = f"torchvision.internal.{module}"
+    name = obj.__class__.__name__
+    if isinstance(obj, FunctionType):
+        name = obj.__name__
+    torch._C._log_api_usage_once(f"{module}.{name}")
+
+class VisionDataset(data.Dataset):
+    """
+    Base Class For making datasets which are compatible with torchvision.
+    It is necessary to override the ``__getitem__`` and ``__len__`` method.
+
+    Args:
+        root (string): Root directory of dataset.
+        transforms (callable, optional): A function/transforms that takes in
+            an image and a label and returns the transformed versions of both.
+        transform (callable, optional): A function/transform that  takes in an PIL image
+            and returns a transformed version. E.g, ``transforms.RandomCrop``
+        target_transform (callable, optional): A function/transform that takes in the
+            target and transforms it.
+
+    .. note::
+
+        :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
+    """
+
+    _repr_indent = 4
+
+    def __init__(
+        self,
+        root: str,
+        transforms: Optional[Callable] = None,
+        transform: Optional[Callable] = None,
+        target_transform: Optional[Callable] = None,
+    ) -> None:
+        _log_api_usage_once(self)
+        if isinstance(root, str):
+            root = os.path.expanduser(root)
+        self.root = root
+
+        has_transforms = transforms is not None
+        has_separate_transform = transform is not None or target_transform is not None
+        if has_transforms and has_separate_transform:
+            raise ValueError("Only transforms or transform/target_transform can be passed as argument")
+
+        # for backwards-compatibility
+        self.transform = transform
+        self.target_transform = target_transform
+
+        if has_separate_transform:
+            transforms = StandardTransform(transform, target_transform)
+        self.transforms = transforms
+
+    def __getitem__(self, index: int) -> Any:
+        """
+        Args:
+            index (int): Index
+
+        Returns:
+            (Any): Sample and meta data, optionally transformed by the respective transforms.
+        """
+        raise NotImplementedError
+
+    def __len__(self) -> int:
+        raise NotImplementedError
+
+    def __repr__(self) -> str:
+        head = "Dataset " + self.__class__.__name__
+        body = [f"Number of datapoints: {self.__len__()}"]
+        if self.root is not None:
+            body.append(f"Root location: {self.root}")
+        body += self.extra_repr().splitlines()
+        if hasattr(self, "transforms") and self.transforms is not None:
+            body += [repr(self.transforms)]
+        lines = [head] + [" " * self._repr_indent + line for line in body]
+        return "\n".join(lines)
+
+    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
+        lines = transform.__repr__().splitlines()
+        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
+
+    def extra_repr(self) -> str:
+        return ""
+
+
+class StandardTransform:
+    def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
+        self.transform = transform
+        self.target_transform = target_transform
+
+    def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
+        if self.transform is not None:
+            input = self.transform(input)
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+        return input, target
+
+    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
+        lines = transform.__repr__().splitlines()
+        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
+
+    def __repr__(self) -> str:
+        body = [self.__class__.__name__]
+        if self.transform is not None:
+            body += self._format_transform_repr(self.transform, "Transform: ")
+        if self.target_transform is not None:
+            body += self._format_transform_repr(self.target_transform, "Target transform: ")
+
+        return "\n".join(body)
diff --git a/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
deleted file mode 100644
index 72d17de8..00000000
--- a/models/cv/object_detection/fcos/ixrt/fcos_center-normbbox-centeronreg-giou_r50_caffe_fpn_gn-head_1x_coco.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-_base_ = 'fcos_r50_caffe_fpn_gn-head_1x_coco.py'
-
-model = dict(
-    backbone=dict(
-        init_cfg=dict(
-            type='Pretrained',
-            checkpoint='open-mmlab://detectron2/resnet50_caffe')),
-    bbox_head=dict(
-        norm_on_bbox=True,
-        centerness_on_reg=True,
-        dcn_on_last_conv=False,
-        center_sampling=True,
-        conv_bias=True,
-        loss_bbox=dict(type='GIoULoss', loss_weight=1.0)),
-    # training and testing settings
-    test_cfg=dict(nms=dict(type='nms', iou_threshold=0.6)))
-
-# dataset settings
-img_norm_cfg = dict(
-    mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
-train_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(type='LoadAnnotations', with_bbox=True),
-    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
-    dict(type='RandomFlip', flip_ratio=0.5),
-    dict(type='Normalize', **img_norm_cfg),
-    dict(type='Pad', size_divisor=32),
-    dict(type='DefaultFormatBundle'),
-    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
-]
-test_pipeline = [
-    dict(type='LoadImageFromFile'),
-    dict(
-        type='MultiScaleFlipAug',
-        img_scale=(1333, 800),
-        flip=False,
-        transforms=[
-            dict(type='Resize', keep_ratio=True),
-            dict(type='RandomFlip'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size_divisor=32),
-            dict(type='ImageToTensor', keys=['img']),
-            dict(type='Collect', keys=['img']),
-        ])
-]
-data = dict(
-    samples_per_gpu=2,
-    workers_per_gpu=2,
-    train=dict(pipeline=train_pipeline),
-    val=dict(pipeline=test_pipeline),
-    test=dict(pipeline=test_pipeline))
-optimizer_config = dict(_delete_=True, grad_clip=None)
-
-lr_config = dict(warmup='linear')
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py b/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py
deleted file mode 100644
index 9218ea3a..00000000
--- a/models/cv/object_detection/fcos/ixrt/fcos_ixrt_inference.py
+++ /dev/null
@@ -1,202 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-import os
-import sys
-from tqdm import tqdm
-import numpy as np
-
-import argparse
-
-import torch
-import mmcv
-from mmdet.datasets import build_dataloader, build_dataset
-from mmdet.models import build_detector
-from mmdet.core import bbox2result
-import cv2
-import numpy as np
-import onnxruntime as rt
-
-import time
-
-import os 
-import copy
-from common import create_engine_context, get_io_bindings
-import pycuda.autoinit
-import pycuda.driver as cuda
-import tensorrt
-from tensorrt import Dims
-
-def check_target(inference, target):
-    satisfied = False
-    if inference > target:
-        satisfied = True  
-    return satisfied    
-
-def get_dataloder(args):
-    cfg_path = args.cfg_file
-    cfg = mmcv.Config.fromfile(cfg_path)
-    datasets_path = args.data_path
-    cfg['data']['val']['img_prefix'] = os.path.join(datasets_path, 'val2017')
-    cfg['data']['val']['ann_file'] = os.path.join(datasets_path, 'annotations/instances_val2017.json')   
-    dataset = build_dataset(cfg.data.val)
-    data_loader = build_dataloader(dataset, samples_per_gpu=args.batch_size, workers_per_gpu=args.num_workers, shuffle=False)
-    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
-    return dataset, data_loader, model
-    
-def eval_coco(args, inputs, outputs, allocations, context):
-    dataset, dataloader, model = get_dataloder(args)
-
-    # Prepare the output data
-    outputs_651 = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
-    outputs_766 = np.zeros(outputs[1]["shape"], outputs[1]["dtype"])
-    outputs_881 = np.zeros(outputs[2]["shape"], outputs[2]["dtype"])
-    outputs_996 = np.zeros(outputs[3]["shape"], outputs[3]["dtype"])
-    outputs_1111 = np.zeros(outputs[4]["shape"], outputs[4]["dtype"])
-    outputs_713 = np.zeros(outputs[5]["shape"], outputs[5]["dtype"])
-    outputs_828 = np.zeros(outputs[6]["shape"], outputs[6]["dtype"])
-    outputs_943 = np.zeros(outputs[7]["shape"], outputs[7]["dtype"])
-    outputs_1058 = np.zeros(outputs[8]["shape"], outputs[8]["dtype"])
-    outputs_1173 = np.zeros(outputs[9]["shape"], outputs[9]["dtype"])
-    outputs_705 = np.zeros(outputs[10]["shape"], outputs[10]["dtype"])
-    outputs_820 = np.zeros(outputs[11]["shape"], outputs[11]["dtype"])
-    outputs_935 = np.zeros(outputs[12]["shape"], outputs[12]["dtype"])
-    outputs_1050 = np.zeros(outputs[13]["shape"], outputs[13]["dtype"])
-    outputs_1165 = np.zeros(outputs[14]["shape"], outputs[14]["dtype"])
-
-    preds = []
-    for batch in tqdm(dataloader):
-        image = batch['img'][0].data.numpy()
-        image = image.astype(inputs[0]["dtype"])
-        # Set input
-        image = np.ascontiguousarray(image) 
-        cuda.memcpy_htod(inputs[0]["allocation"], image)
-        context.execute_v2(allocations)
-        # # Fetch output
-        cuda.memcpy_dtoh(outputs_651, outputs[0]["allocation"])
-        cuda.memcpy_dtoh(outputs_766, outputs[1]["allocation"])
-        cuda.memcpy_dtoh(outputs_881, outputs[2]["allocation"])
-        cuda.memcpy_dtoh(outputs_996, outputs[3]["allocation"])
-        cuda.memcpy_dtoh(outputs_1111, outputs[4]["allocation"])
-        cuda.memcpy_dtoh(outputs_713, outputs[5]["allocation"])
-        cuda.memcpy_dtoh(outputs_828, outputs[6]["allocation"])
-        cuda.memcpy_dtoh(outputs_943, outputs[7]["allocation"])
-        cuda.memcpy_dtoh(outputs_1058, outputs[8]["allocation"])
-        cuda.memcpy_dtoh(outputs_1173, outputs[9]["allocation"])
-        cuda.memcpy_dtoh(outputs_705, outputs[10]["allocation"])
-        cuda.memcpy_dtoh(outputs_820, outputs[11]["allocation"])
-        cuda.memcpy_dtoh(outputs_935, outputs[12]["allocation"])
-        cuda.memcpy_dtoh(outputs_1050, outputs[13]["allocation"])
-        cuda.memcpy_dtoh(outputs_1165, outputs[14]["allocation"])
-
-        cls_score = []
-        box_reg = []
-        score_factors = []
-        cls_score.append(torch.from_numpy(outputs_651))
-        cls_score.append(torch.from_numpy(outputs_766))
-        cls_score.append(torch.from_numpy(outputs_881))
-        cls_score.append(torch.from_numpy(outputs_996))
-        cls_score.append(torch.from_numpy(outputs_1111))
-
-        box_reg.append(torch.from_numpy(outputs_713))
-        box_reg.append(torch.from_numpy(outputs_828))
-        box_reg.append(torch.from_numpy(outputs_943))
-        box_reg.append(torch.from_numpy(outputs_1058))
-        box_reg.append(torch.from_numpy(outputs_1173))
-
-        score_factors.append(torch.from_numpy(outputs_705))
-        score_factors.append(torch.from_numpy(outputs_820))
-        score_factors.append(torch.from_numpy(outputs_935))
-        score_factors.append(torch.from_numpy(outputs_1050))
-        score_factors.append(torch.from_numpy(outputs_1165))
-
-        cls_score.sort(key=lambda x: x.shape[3],reverse=True)
-        box_reg.sort(key=lambda x: x.shape[3],reverse=True)       
-        score_factors.sort(key=lambda x: x.shape[3],reverse=True)
-
-        pred = model.bbox_head.get_bboxes(cls_score, box_reg, score_factors=score_factors, img_metas=batch['img_metas'][0].data[0], rescale=True)
-        bbox_results = [
-            bbox2result(det_bboxes, det_labels, model.bbox_head.num_classes)
-            for det_bboxes, det_labels in pred
-        ]
-        preds.extend(bbox_results)
-    eval_results = dataset.evaluate(preds, metric=['bbox'])
-    print(eval_results)
-    
-    map50 = eval_results['bbox_mAP_50']
-    return map50   
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    # engine args
-    parser.add_argument("--engine", type=str, default="./r50_fcos.engine")
-    parser.add_argument("--cfg_file", type=str, default="fcos_r50_caffe_fpn_gn-head_1x_coco.py")
-    parser.add_argument("--data_path", type=str, default="/home/datasets/cv/coco")
-    parser.add_argument("--batch_size", type=int, default=16)
-    parser.add_argument("--num_workers", type=int, default=4)
-    parser.add_argument("--image_file", type=str, default="/home/fangjian.hu/workspace/ixrt/data/fcos_test/test_800.jpg")
-    parser.add_argument("--warp_up", type=int, default=40)
-    parser.add_argument("--loop_count", type=int, default=50)
-    
-    parser.add_argument("--target_map", default=0.56, type=float, help="target map0.5")
-    parser.add_argument("--target_fps", default=50, type=float, help="target fps")
-    parser.add_argument("--task", default="precision", type=str, help="precision or pref")
-    
-    
-    args = parser.parse_args()
-    return args
-
-def main():
-    args= parse_args()
-    host_mem = tensorrt.IHostMemory
-    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
-
-    # Load Engine
-    engine, context = create_engine_context(args.engine, logger)
-    inputs, outputs, allocations = get_io_bindings(engine)
-        
-    if args.task=="precision":
-        start_time = time.time()
-        map50= eval_coco(args,inputs, outputs, allocations, context)
-        end_time = time.time()
-        e2e_time = end_time - start_time
-        print(F"E2E time : {e2e_time:.3f} seconds")
-        
-        print("="*40)
-        print("MAP50:{0}".format(round(map50,3)))
-        print("="*40)
-        print(f"Check MAP50 Test : {round(map50,3)}  Target:{args.target_map} State : {'Pass' if round(map50,3) >= args.target_map else 'Fail'}")
-        status_map = check_target(map50, args.target_map)
-        sys.exit(int(not (status_map)))
-        
-    else:
-        torch.cuda.synchronize()
-        start_time = time.time()
-        for i in range(args.loop_count):
-            context.execute_v2(allocations)  
-        torch.cuda.synchronize()
-        end_time = time.time()
-        forward_time = end_time - start_time
-        fps = args.loop_count * args.batch_size / forward_time
-        print("="*40)
-        print("fps:{0}".format(round(fps,2)))
-        print("="*40)
-        print(f"Check fps Test : {round(fps,3)}  Target:{args.target_fps} State : {'Pass' if  fps >= args.target_fps else 'Fail'}")
-        status_fps = check_target(fps, args.target_fps)
-        sys.exit(int(not (status_fps)))
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py
index 758d1d88..07b17960 100644
--- a/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py
+++ b/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py
@@ -1,103 +1,253 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-# model settings
+auto_scale_lr = dict(base_batch_size=16, enable=False)
+backend_args = None
+data_root = 'data/coco/'
+dataset_type = 'CocoDataset'
+default_hooks = dict(
+    checkpoint=dict(interval=1, type='CheckpointHook'),
+    logger=dict(interval=50, type='LoggerHook'),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    timer=dict(type='IterTimerHook'),
+    visualization=dict(type='DetVisualizationHook'))
+default_scope = 'mmdet'
+env_cfg = dict(
+    cudnn_benchmark=False,
+    dist_cfg=dict(backend='nccl'),
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0))
+load_from = None
+log_level = 'ERROR'
+log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50)
 model = dict(
-    type='FCOS',
     backbone=dict(
-        type='ResNet',
         depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
         frozen_stages=1,
-        norm_cfg=dict(type='BN', requires_grad=False),
+        init_cfg=dict(
+            checkpoint='open-mmlab://detectron/resnet50_caffe',
+            type='Pretrained'),
+        norm_cfg=dict(requires_grad=False, type='BN'),
         norm_eval=True,
+        num_stages=4,
+        out_indices=(
+            0,
+            1,
+            2,
+            3,
+        ),
         style='caffe',
-        init_cfg=dict(
-            type='Pretrained',
-            checkpoint='open-mmlab://detectron/resnet50_caffe')),
-    neck=dict(
-        type='FPN',
-        in_channels=[256, 512, 1024, 2048],
-        out_channels=256,
-        start_level=1,
-        add_extra_convs='on_output',  # use P5
-        num_outs=5,
-        relu_before_extra_convs=True),
+        type='ResNet'),
     bbox_head=dict(
-        type='FCOSHead',
-        num_classes=80,
-        in_channels=256,
-        stacked_convs=4,
         feat_channels=256,
-        strides=[8, 16, 32, 64, 128],
+        in_channels=256,
+        loss_bbox=dict(loss_weight=1.0, type='IoULoss'),
+        loss_centerness=dict(
+            loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=True),
         loss_cls=dict(
-            type='FocalLoss',
-            use_sigmoid=True,
-            gamma=2.0,
             alpha=0.25,
-            loss_weight=1.0),
-        loss_bbox=dict(type='IoULoss', loss_weight=1.0),
-        loss_centerness=dict(
-            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
-    # training and testing settings
-    train_cfg=dict(
-        assigner=dict(
-            type='MaxIoUAssigner',
-            pos_iou_thr=0.5,
-            neg_iou_thr=0.4,
-            min_pos_iou=0,
-            ignore_iof_thr=-1),
-        allowed_border=-1,
-        pos_weight=-1,
-        debug=False),
+            gamma=2.0,
+            loss_weight=1.0,
+            type='FocalLoss',
+            use_sigmoid=True),
+        num_classes=80,
+        stacked_convs=4,
+        strides=[
+            8,
+            16,
+            32,
+            64,
+            128,
+        ],
+        type='FCOSHead'),
+    data_preprocessor=dict(
+        bgr_to_rgb=False,
+        mean=[
+            102.9801,
+            115.9465,
+            122.7717,
+        ],
+        pad_size_divisor=32,
+        std=[
+            1.0,
+            1.0,
+            1.0,
+        ],
+        type='DetDataPreprocessor'),
+    neck=dict(
+        add_extra_convs='on_output',
+        in_channels=[
+            256,
+            512,
+            1024,
+            2048,
+        ],
+        num_outs=5,
+        out_channels=256,
+        relu_before_extra_convs=True,
+        start_level=1,
+        type='FPN'),
     test_cfg=dict(
-        nms_pre=1000,
+        max_per_img=100,
         min_bbox_size=0,
-        score_thr=0.05,
-        nms=dict(type='nms', iou_threshold=0.5),
-        max_per_img=100))
-
-# dataset settings
-dataset_type = 'CocoDataset'
-data_root = 'data/coco/'
-img_norm_cfg = dict(
-    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
-
+        nms=dict(iou_threshold=0.5, type='nms'),
+        nms_pre=1000,
+        score_thr=0.05),
+    type='FCOS')
+optim_wrapper = dict(
+    clip_grad=dict(max_norm=35, norm_type=2),
+    optimizer=dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0001),
+    paramwise_cfg=dict(bias_decay_mult=0.0, bias_lr_mult=2.0),
+    type='OptimWrapper')
+param_scheduler = [
+    dict(
+        begin=0,
+        by_epoch=False,
+        end=500,
+        factor=0.3333333333333333,
+        type='ConstantLR'),
+    dict(
+        begin=0,
+        by_epoch=True,
+        end=12,
+        gamma=0.1,
+        milestones=[
+            8,
+            11,
+        ],
+        type='MultiStepLR'),
+]
+resume = False
+test_cfg = dict(type='TestLoop')
+test_dataloader = dict(
+    batch_size=32,
+    dataset=dict(
+        ann_file='annotations/instances_val2017.json',
+        backend_args=None,
+        data_prefix=dict(img='val2017/'),
+        data_root='/home/xiaomei.wang/ixrt-modelzoo-new/data/datasets/coco2017',
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(keep_ratio=True, scale=(
+                800,
+                800,
+            ), type='Resize'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(
+                meta_keys=(
+                    'img_id',
+                    'img_path',
+                    'ori_shape',
+                    'img_shape',
+                    'scale_factor',
+                ),
+                type='PackDetInputs'),
+        ],
+        test_mode=True,
+        type='CocoDataset'),
+    drop_last=False,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+test_evaluator = dict(
+    ann_file=
+    '/home/xiaomei.wang/ixrt-modelzoo-new/data/datasets/coco2017/annotations/instances_val2017.json',
+    backend_args=None,
+    format_only=False,
+    metric='bbox',
+    type='CocoMetric')
 test_pipeline = [
-    dict(type='LoadImageFromFile'),
+    dict(backend_args=None, type='LoadImageFromFile'),
+    dict(keep_ratio=False, scale=(
+        800,
+        800,
+    ), type='Resize'),
+    dict(type='LoadAnnotations', with_bbox=True),
     dict(
-        type='MultiScaleFlipAug',
-        img_scale=(800, 800),
-        flip=False,
-        transforms=[
-            dict(type='Resize', keep_ratio=True),
-            dict(type='RandomFlip'),
-            dict(type='Normalize', **img_norm_cfg),
-            dict(type='Pad', size=(800, 800)),
-            dict(type='ImageToTensor', keys=['img']),
-            dict(type='Collect', keys=['img']),
-        ])
+        meta_keys=(
+            'img_id',
+            'img_path',
+            'ori_shape',
+            'img_shape',
+            'scale_factor',
+        ),
+        type='PackDetInputs'),
+]
+train_cfg = dict(max_epochs=12, type='EpochBasedTrainLoop', val_interval=1)
+train_dataloader = dict(
+    batch_sampler=dict(type='AspectRatioBatchSampler'),
+    batch_size=2,
+    dataset=dict(
+        ann_file='annotations/instances_train2017.json',
+        backend_args=None,
+        data_prefix=dict(img='train2017/'),
+        data_root='data/coco/',
+        filter_cfg=dict(filter_empty_gt=True, min_size=32),
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(keep_ratio=True, scale=(
+                1333,
+                800,
+            ), type='Resize'),
+            dict(prob=0.5, type='RandomFlip'),
+            dict(type='PackDetInputs'),
+        ],
+        type='CocoDataset'),
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=True, type='DefaultSampler'))
+train_pipeline = [
+    dict(backend_args=None, type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(keep_ratio=True, scale=(
+        1333,
+        800,
+    ), type='Resize'),
+    dict(prob=0.5, type='RandomFlip'),
+    dict(type='PackDetInputs'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(
+    batch_size=1,
+    dataset=dict(
+        ann_file='annotations/instances_val2017.json',
+        backend_args=None,
+        data_prefix=dict(img='val2017/'),
+        data_root='data/coco/',
+        pipeline=[
+            dict(backend_args=None, type='LoadImageFromFile'),
+            dict(keep_ratio=False, scale=(
+                800,
+                800,
+            ), type='Resize'),
+            dict(type='LoadAnnotations', with_bbox=True),
+            dict(
+                meta_keys=(
+                    'img_id',
+                    'img_path',
+                    'ori_shape',
+                    'img_shape',
+                    'scale_factor',
+                ),
+                type='PackDetInputs'),
+        ],
+        test_mode=True,
+        type='CocoDataset'),
+    drop_last=False,
+    num_workers=2,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(
+    ann_file='data/coco/annotations/instances_val2017.json',
+    backend_args=None,
+    format_only=False,
+    metric='bbox',
+    type='CocoMetric')
+vis_backends = [
+    dict(type='LocalVisBackend'),
 ]
-data = dict(
-    samples_per_gpu=32,
-    workers_per_gpu=1,
-    val=dict(
-        type=dataset_type,
-        ann_file=data_root + 'annotations/instances_val2017.json',
-        img_prefix=data_root + 'images/val2017/',
-        pipeline=test_pipeline)
-    )
-evaluation = dict(interval=1, metric='bbox')
\ No newline at end of file
+visualizer = dict(
+    name='visualizer',
+    type='DetLocalVisualizer',
+    vis_backends=[
+        dict(type='LocalVisBackend'),
+    ])
+work_dir = './'
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/inference_mmdet.py b/models/cv/object_detection/fcos/ixrt/inference_mmdet.py
new file mode 100644
index 00000000..18fd4755
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/inference_mmdet.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import os
+import time
+import argparse
+import tensorrt
+import torch
+import torchvision
+import numpy as np
+from tensorrt import Dims
+from cuda import cuda, cudart
+from tqdm import tqdm
+from mmdet.registry import RUNNERS
+from mmengine.config import Config
+
+from common import create_engine_context, get_io_bindings
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    
+    parser.add_argument("--engine", 
+                        type=str, 
+                        required=True, 
+                        help="ixrt engine path.")
+    
+    # parser.add_argument("--model_name", type=str, default="")
+    
+    parser.add_argument("--cfg_file", type=str, default="")
+    
+    parser.add_argument("--batchsize",
+                        type=int,
+                        required=True, 
+                        help="inference batch size.")
+    
+    parser.add_argument("--datasets", 
+                        type=str, 
+                        required=True, 
+                        help="datasets path.")
+    
+    parser.add_argument("--warmup", 
+                        type=int, 
+                        default=3, 
+                        help="number of warmup before test.")           
+    
+    parser.add_argument("--acc_target",
+                        type=float,
+                        default=None,
+                        help="Model inference Accuracy target.")
+    
+    parser.add_argument("--fps_target",
+                        type=float,
+                        default=None,
+                        help="Model inference FPS target.")
+
+    parser.add_argument("--perf_only",
+                        type=bool,
+                        default=False,
+                        help="Run performance test only")
+    
+    args = parser.parse_args()
+
+    return args
+
+def main():
+    args = parse_args()
+
+    batch_size = args.batchsize
+
+    host_mem = tensorrt.IHostMemory
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+    # Load Engine && I/O bindings
+    engine, context = create_engine_context(args.engine, logger)
+    inputs, outputs, allocations = get_io_bindings(engine)
+    
+    if args.warmup > 0:
+        print("\nWarm Start.")
+        for i in range(args.warmup):
+            context.execute_v2(allocations)
+        print("Warm Done.")
+    
+    # just run perf test
+    if args.perf_only:
+        torch.cuda.synchronize()
+        start_time = time.time()
+
+        for i in range(10):
+            context.execute_v2(allocations)
+
+        torch.cuda.synchronize()
+        end_time = time.time()
+        forward_time = end_time - start_time
+        num_samples = 10 * args.batchsize
+        fps = num_samples / forward_time
+
+        print("FPS : ", fps)
+        print(f"Performance Check : Test {fps} >= target {args.fps_target}")
+        if fps >= args.fps_target:
+            print("pass!")
+            exit()
+        else:
+            print("failed!")
+            exit(1)
+    else:
+         # Runner config
+        cfg = Config.fromfile(args.cfg_file)
+        cfg.work_dir = "./"
+        
+        cfg['test_dataloader']['batch_size'] = batch_size
+        cfg['test_dataloader']['dataset']['data_root'] = args.datasets
+        cfg['test_dataloader']['dataset']['data_prefix']['img'] = 'val2017/'
+        cfg['test_evaluator']['ann_file'] = os.path.join(args.datasets, 'annotations/instances_val2017.json')
+        cfg['log_level'] = 'ERROR'
+
+        runner = RUNNERS.build(cfg)
+        
+        for input_data in tqdm(runner.test_dataloader):
+            
+            input_data = runner.model.data_preprocessor(input_data, False)
+            image = input_data['inputs'].cpu()
+            image = image.numpy().astype(inputs[0]["dtype"])
+            pad_batch = len(image) != batch_size
+            if pad_batch:
+                origin_size = len(image)
+                image = np.resize(image, (batch_size, *image.shape[1:]))
+            image = np.ascontiguousarray(image)
+
+            (err,) = cudart.cudaMemcpy(
+                inputs[0]["allocation"],
+                image,
+                image.nbytes,
+                cudart.cudaMemcpyKind.cudaMemcpyHostToDevice,
+            )
+            assert err == cudart.cudaError_t.cudaSuccess
+
+            context.execute_v2(allocations)
+            
+            cls_score = []
+            box_reg = []
+            score_factors = []
+            for i in range(len(outputs)):
+                output = np.zeros(outputs[i]["shape"], outputs[i]["dtype"])
+                (err,) = cudart.cudaMemcpy(
+                    output,
+                    outputs[i]["allocation"],
+                    outputs[i]["nbytes"],
+                    cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost,
+                )
+                assert err == cudart.cudaError_t.cudaSuccess
+
+                if pad_batch:
+                    output = output[:origin_size]
+
+                output = torch.from_numpy(output)
+                
+                if output.shape[1] == 80:
+                    cls_score.append(output)
+                elif output.shape[1] == 4:
+                    box_reg.append(output)
+                else:
+                    score_factors.append(output)
+
+            batch_img_metas = [
+                data_samples.metainfo for data_samples in input_data['data_samples']
+            ]
+            
+            if "fovea_r50" or "fsaf" in args.cfg_file:
+                results_list = runner.model.bbox_head.predict_by_feat(cls_score, box_reg, batch_img_metas=batch_img_metas, rescale=True)
+            else:
+                results_list = runner.model.bbox_head.predict_by_feat(cls_score, box_reg, score_factors, batch_img_metas=batch_img_metas, rescale=True)
+
+            batch_data_samples = runner.model.add_pred_to_datasample(input_data['data_samples'], results_list)
+
+            runner.test_evaluator.process(data_samples=batch_data_samples, data_batch=input_data)
+
+        metrics = runner.test_evaluator.evaluate(len(runner.test_dataloader.dataset))
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
new file mode 100644
index 00000000..932efbdf
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,12 @@
+import ctypes
+import tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
+    ctypes.CDLL(dynamic_path)
+    tensorrt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/modify_batchsize.py b/models/cv/object_detection/fcos/ixrt/modify_batchsize.py
new file mode 100644
index 00000000..00ed65dd
--- /dev/null
+++ b/models/cv/object_detection/fcos/ixrt/modify_batchsize.py
@@ -0,0 +1,37 @@
+import onnx
+import argparse
+
+def change_input_dim(model, bsz):
+    batch_size = bsz
+
+    # The following code changes the first dimension of every input to be batch_size
+    # Modify as appropriate ... note that this requires all inputs to
+    # have the same batch_size
+    inputs = model.graph.input
+    for input in inputs:
+        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
+        # Add checks as needed.
+        dim1 = input.type.tensor_type.shape.dim[0]
+        # update dim to be a symbolic value
+        if isinstance(batch_size, str):
+            # set dynamic batch size
+            dim1.dim_param = batch_size
+        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
+            # set given batch size
+            dim1.dim_value = int(batch_size)
+        else:
+            # set batch size of 1
+            dim1.dim_value = 1
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch_size", type=int)
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_input_dim(model, args.batch_size)
+onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/requirements.txt b/models/cv/object_detection/fcos/ixrt/requirements.txt
index a0763974..c6f9129a 100644
--- a/models/cv/object_detection/fcos/ixrt/requirements.txt
+++ b/models/cv/object_detection/fcos/ixrt/requirements.txt
@@ -1,10 +1,9 @@
+yapf==0.40.2
+addict==2.4.0
+mmdet==3.3.0
 tqdm
 onnx
 onnxsim
-ultralytics
 pycocotools
-addict
-yapf
-pycuda
-mmdet==2.28.2
-opencv-python==4.6.0.66
\ No newline at end of file
+opencv-python==4.6.0.66
+mmengine
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
index b6ccfe62..aa081403 100755
--- a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
+++ b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_accuracy.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
@@ -24,7 +9,7 @@ check_status()
 }
 
 # Run paraments
-BSZ=1
+BSZ=32
 WARM_UP=-1
 TGT=-1
 LOOP_COUNT=-1
@@ -44,15 +29,15 @@ do
     esac
 done
 
-MODEL_NAME="r50_fcos"
+MODEL_NAME="fcos_opt"
+ORIGINE_MODEL="${CHECKPOINTS_DIR}/fcos_opt.onnx"
 
-echo PROJ_DIR ${PROJ_DIR}
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
 
 step=0
-
+CURRENT_MODEL=${ORIGINE_MODEL}
 # Simplify Model
 let step++
 echo;
@@ -62,22 +47,40 @@ if [ -f ${SIM_MODEL} ];then
     echo "  "Simplify Model Skipped, ${SIM_MODEL} has been existed
 else
     python3 ${RUN_DIR}/simplify_model.py \
-            --origin_model ${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx    \
+            --origin_model ${CURRENT_MODEL}   \
             --output_model ${SIM_MODEL}
     echo "  "Generate ${SIM_MODEL}
 fi
 
+CURRENT_MODEL=${SIM_MODEL}
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
+else
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
+    echo "  "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
 
 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs{BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --model ${SIM_MODEL}                \
+        --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
@@ -86,11 +89,10 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Inference
-python3 ${RUN_DIR}/fcos_ixrt_inference.py \
+python3 ${RUN_DIR}/inference_mmdet.py \
         --engine ${ENGINE_FILE} \
         --cfg_file ${RUN_DIR}/fcos_r50_caffe_fpn_gn-head_1x_coco.py \
-        --task "precision" \
-        --data_path ${DATASETS_DIR} \
-        --batch_size 1 \
-        --target_map 0.54; check_status
+        --datasets ${DATASETS_DIR} \
+        --batchsize ${BSZ} \
+        --acc_target ${TGT}; check_status
 exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
index 2bcf4d56..26a6bc83 100755
--- a/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
+++ b/models/cv/object_detection/fcos/ixrt/scripts/infer_fcos_fp16_performance.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
@@ -24,7 +9,7 @@ check_status()
 }
 
 # Run paraments
-BSZ=1
+BSZ=32
 WARM_UP=-1
 TGT=-1
 LOOP_COUNT=-1
@@ -44,15 +29,15 @@ do
     esac
 done
 
-MODEL_NAME="r50_fcos"
+MODEL_NAME="fcos_opt"
+ORIGINE_MODEL="${CHECKPOINTS_DIR}/fcos_opt.onnx"
 
-echo PROJ_DIR ${PROJ_DIR}
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
 
 step=0
-
+CURRENT_MODEL=${ORIGINE_MODEL}
 # Simplify Model
 let step++
 echo;
@@ -62,22 +47,40 @@ if [ -f ${SIM_MODEL} ];then
     echo "  "Simplify Model Skipped, ${SIM_MODEL} has been existed
 else
     python3 ${RUN_DIR}/simplify_model.py \
-            --origin_model ${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx    \
+            --origin_model ${CURRENT_MODEL}   \
             --output_model ${SIM_MODEL}
     echo "  "Generate ${SIM_MODEL}
 fi
 
+CURRENT_MODEL=${SIM_MODEL}
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
+else
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
+    echo "  "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
 
 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs{BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --model ${SIM_MODEL}                \
+        --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
@@ -86,10 +89,11 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Inference
-python3 ${RUN_DIR}/fcos_ixrt_inference.py \
+python3 ${RUN_DIR}/inference_mmdet.py \
         --engine ${ENGINE_FILE} \
         --cfg_file ${RUN_DIR}/fcos_r50_caffe_fpn_gn-head_1x_coco.py \
-        --task "pref" \
-        --batch_size 1 \
-        --target_fps 40; check_status
+        --perf_only True \
+        --datasets ${DATASETS_DIR} \
+        --batchsize ${BSZ} \
+        --fps_target ${TGT}; check_status
 exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/simplify_model.py b/models/cv/object_detection/fcos/ixrt/simplify_model.py
index 1400fd81..b4254b6f 100644
--- a/models/cv/object_detection/fcos/ixrt/simplify_model.py
+++ b/models/cv/object_detection/fcos/ixrt/simplify_model.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import argparse
 from onnxsim import simplify
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 90692c6b..39e112ba 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -378,6 +378,23 @@ def run_segmentation_and_face_testcase(model):
             result["result"][prec].update(get_metric_result(m))
         if len(matchs) == 2:
             result["result"][prec]["status"] = "PASS"
+        else:
+            patterns = {
+                "FPS": r"FPS\s*:\s*(\d+\.?\d*)",
+                "Accuracy": r"Accuracy\s*:\s*(\d+\.?\d*)"
+            }
+
+            combined_pattern = re.compile("|".join(f"(?P<{name}>{pattern})" for name, pattern in patterns.items()))
+            matchs = combined_pattern.finditer(sout)
+            match_count = 0
+            for match in matchs:
+                for name, value in match.groupdict().items():
+                    if value:
+                        match_count += 1
+                        result["result"][prec][name] = float(f"{float(value.split(':')[1].strip()):.3f}")
+                        break
+            if match_count == len(patterns):
+                result["result"][prec]["status"] = "PASS"
 
         result["result"][prec]["Cost time (s)"] = t
         logging.debug(f"matchs:\n{matchs}")
-- 
Gitee


From 9419441264a519d6c54770fb4fa6f2f11827baa5 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Tue, 1 Jul 2025 14:04:00 +0800
Subject: [PATCH 07/15] sync facenet

---
 .../face_recognition/facenet/ixrt/README.md   |  16 +-
 .../facenet/ixrt/build_engine.py              |  19 +-
 .../facenet/ixrt/ci/prepare.sh                |  12 +-
 .../face_recognition/facenet/ixrt/common.py   |  21 +-
 .../facenet/ixrt/config/FACENET_CONFIG        |  15 -
 .../face_recognition/facenet/ixrt/deploy.py   |  29 +-
 .../facenet/ixrt/inference.py                 |  39 +-
 .../facenet/ixrt/load_ixrt_plugin.py          |  15 -
 .../facenet/ixrt/modify_batchsize.py          |  39 ++
 .../cv/face_recognition/facenet/ixrt/quant.py |  26 +-
 .../facenet/ixrt/requirements.txt             |  15 +-
 .../scripts/infer_facenet_fp16_accuracy.sh    |  31 +-
 .../scripts/infer_facenet_fp16_performance.sh |  15 -
 .../scripts/infer_facenet_int8_accuracy.sh    |  15 -
 .../scripts/infer_facenet_int8_performance.sh |  15 -
 .../facenet/ixrt/tensorflow2pytorch.py        | 387 ------------------
 .../cv/face_recognition/facenet/ixrt/utils.py |  15 -
 tests/model_info.json                         |   2 +-
 18 files changed, 96 insertions(+), 630 deletions(-)
 create mode 100644 models/cv/face_recognition/facenet/ixrt/modify_batchsize.py
 delete mode 100644 models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py

diff --git a/models/cv/face_recognition/facenet/ixrt/README.md b/models/cv/face_recognition/facenet/ixrt/README.md
index 62d4de80..cd58c9b9 100644
--- a/models/cv/face_recognition/facenet/ixrt/README.md
+++ b/models/cv/face_recognition/facenet/ixrt/README.md
@@ -18,12 +18,6 @@ Pretrained model: <https://drive.google.com/open?id=1R77HmFADxe87GmoLwzfgMu_HY0I
 
 Dataset: <https://vis-www.cs.umass.edu/lfw/lfw.tgz> to download the lfw dataset.
 
-```bash
-cd ${DeepSparkInference_PATH}/models/cv/face/facenet/ixrt
-# download and unzip 20180408-102900.zip
-unzip 20180408-102900.zip
-```
-
 ### Install Dependencies
 
 ```bash
@@ -41,14 +35,8 @@ pip3 install -r requirements.txt
 ```bash
 mkdir -p checkpoints
 mkdir -p facenet_weights
-git clone https://github.com/timesler/facenet-pytorch
-# facenet-pytorch/dependencies/facenet is submodule, pls make sure it has been cloned or you can clone directly from https://github.com/davidsandberg/facenet/tree/096ed770f163957c1e56efa7feeb194773920f6e
-mv /Path/facenet/ixrt/tensorflow2pytorch.py facenet-pytorch
-python3 ./facenet-pytorch/tensorflow2pytorch.py \
-        --facenet_weights_path ./facenet_weights \
-        --facenet_pb_path ./20180408-102900 \
-        --onnx_save_name facenet_export.onnx
-mv facenet_export.onnx ./facenet_weights
+cd facenet_weights
+wget http://files.deepspark.org.cn:880/deepspark/facenet_export.onnx
 ```
 
 ### Data preprocessing
diff --git a/models/cv/face_recognition/facenet/ixrt/build_engine.py b/models/cv/face_recognition/facenet/ixrt/build_engine.py
index 74a62202..057587f8 100644
--- a/models/cv/face_recognition/facenet/ixrt/build_engine.py
+++ b/models/cv/face_recognition/facenet/ixrt/build_engine.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import argparse
@@ -28,13 +13,13 @@ from load_ixrt_plugin import load_ixrt_plugin
 load_ixrt_plugin()
 
 def add_facenet_norm(onnx_model):
-    norm = helper.make_node('FacenetNorm_IxRT', inputs=['/last_bn/BatchNormalization_output_0'] , outputs=['/Pow_1_output_0'], name='facenet_norm_1', size=512)
+    norm = helper.make_node('FacenetNorm_IxRT', inputs=['1189'] , outputs=['1190'], name='facenet_norm_1', size=512)
     
     onnx_model = onnx.load(onnx_model)
     graph = onnx_model.graph
     nodes = graph.node
     graph.node.append(norm)
-    output = onnx.helper.make_tensor_value_info('/Pow_1_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+    output = onnx.helper.make_tensor_value_info('1190', TensorProto.FLOAT, [64, 512, 1, 1])
     graph = onnx.helper.make_graph(
         graph.node,
         "facenet model",
diff --git a/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh b/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh
index 9d7081e7..44ffa453 100644
--- a/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh
+++ b/models/cv/face_recognition/facenet/ixrt/ci/prepare.sh
@@ -26,16 +26,6 @@ else
 fi
 
 pip3 install -r requirements.txt
-unzip -q /root/data/checkpoints/20180408-102900.zip -d ./
 unzip -q /root/data/datasets/facenet_datasets.zip -d ./
-mkdir -p checkpoints
 mkdir -p facenet_weights
-cp -r /root/data/3rd_party/facenet-pytorch ./
-cp ./tensorflow2pytorch.py facenet-pytorch
-python3 ./facenet-pytorch/tensorflow2pytorch.py \
-        --facenet_weights_path ./facenet_weights \
-        --facenet_pb_path ./20180408-102900 \
-        --onnx_save_name facenet_export.onnx
-mv facenet_export.onnx ./facenet_weights
-
-sed -i -e 's#/last_bn/BatchNormalization_output_0#1187#g' -e 's#/avgpool_1a/GlobalAveragePool_output_0#1178#g' deploy.py build_engine.py
\ No newline at end of file
+cp /root/data/checkpoints/facenet_export.onnx ./facenet_weights
diff --git a/models/cv/face_recognition/facenet/ixrt/common.py b/models/cv/face_recognition/facenet/ixrt/common.py
index 9db1327a..4b9ae114 100644
--- a/models/cv/face_recognition/facenet/ixrt/common.py
+++ b/models/cv/face_recognition/facenet/ixrt/common.py
@@ -1,25 +1,10 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import cv2
 import glob
 import torch
 import tensorrt
 import numpy as np
-import pycuda.driver as cuda
+from cuda import cuda, cudart
 
 from torch.utils.data import DataLoader, SubsetRandomSampler, SequentialSampler
 from torchvision import datasets, transforms
@@ -53,13 +38,15 @@ def get_io_bindings(engine):
         size = np.dtype(tensorrt.nptype(dtype)).itemsize
         for s in shape:
             size *= s
-        allocation = cuda.mem_alloc(size)
+        err, allocation = cudart.cudaMalloc(size)
+        assert err == cudart.cudaError_t.cudaSuccess
         binding = {
             "index": i,
             "name": name,
             "dtype": np.dtype(tensorrt.nptype(dtype)),
             "shape": list(shape),
             "allocation": allocation,
+            "nbytes": size,
         }
         print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
         allocations.append(allocation)
diff --git a/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG b/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG
index 3b3282ef..74ea45fe 100644
--- a/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG
+++ b/models/cv/face_recognition/facenet/ixrt/config/FACENET_CONFIG
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 # IMGSIZE : 模型输入hw大小
 # MODEL_NAME : 生成onnx/engine的basename
 # ORIGINE_MODEL : 原始onnx文件名称
diff --git a/models/cv/face_recognition/facenet/ixrt/deploy.py b/models/cv/face_recognition/facenet/ixrt/deploy.py
index 79f4ce58..3036363f 100644
--- a/models/cv/face_recognition/facenet/ixrt/deploy.py
+++ b/models/cv/face_recognition/facenet/ixrt/deploy.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import onnx
 import os
 import simplejson as json
@@ -34,7 +19,7 @@ def onnx_sim(onnx_name, save_name):
 
 def cut_model(onnx_name):
     input_names = ["input"]
-    output_names = ["/last_bn/BatchNormalization_output_0"]
+    output_names = ["1189"]
     onnx.utils.extract_model(onnx_name, onnx_name, input_names, output_names) 
 
 def fuse_matmul(onnx_name, save_onnx_name):
@@ -94,10 +79,10 @@ def fuse_matmul(onnx_name, save_onnx_name):
             graph.initializer.append(conv_bias_new_initializer)
 
             pre_node.op_type = "Conv"
-            pre_node.input[0] = "/avgpool_1a/GlobalAveragePool_output_0"
+            pre_node.input[0] = "1180"
             pre_node.input[1] = "conv_weights_new"
             pre_node.input.append("conv_bias_new")
-            pre_node.output[0] = "/last_bn/BatchNormalization_output_0"
+            pre_node.output[0] = "1189"
             dilations = onnx.helper.make_attribute("dilations", [1,1])
             group = onnx.helper.make_attribute("group", 1)
             kernel_shape = onnx.helper.make_attribute("kernel_shape", [1,1])
@@ -119,7 +104,7 @@ def fuse_matmul(onnx_name, save_onnx_name):
             graph.node.remove(node)
 
     if find_matmul==1:
-        output = onnx.helper.make_tensor_value_info('/last_bn/BatchNormalization_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+        output = onnx.helper.make_tensor_value_info('1189', TensorProto.FLOAT, [64, 512, 1, 1])
         graph = onnx.helper.make_graph(
             graph.node,
             "facenet model",
@@ -389,10 +374,10 @@ def add_facenet_norm(cfg_name):
 
     graph_json["nodes"]["facenet_norm_1"] = {
             "inputs": [
-                "/last_bn/BatchNormalization_output_0"
+                "1189"
             ],
             "outputs": [
-                "/Pow_1_output_0"
+                "1190"
             ],
             "op_type": "FacenetNorm",
             "attrbiute": {
@@ -400,7 +385,7 @@ def add_facenet_norm(cfg_name):
             }
         }
     graph_json["output"] = []
-    graph_json["output"].append({"name":"/Pow_1_output_0", "type":"float32"})
+    graph_json["output"].append({"name":"1190", "type":"float32"})
 
     with open(cfg_name, "w") as fh:
         json.dump(graph_json, fh, indent=4)
diff --git a/models/cv/face_recognition/facenet/ixrt/inference.py b/models/cv/face_recognition/facenet/ixrt/inference.py
index eaed8b27..74a43f3e 100644
--- a/models/cv/face_recognition/facenet/ixrt/inference.py
+++ b/models/cv/face_recognition/facenet/ixrt/inference.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
@@ -25,8 +10,7 @@ from tqdm import tqdm
 
 import cv2
 import numpy as np
-import pycuda.autoinit
-import pycuda.driver as cuda
+from cuda import cuda, cudart
 import torch
 import tensorrt
 from tensorrt.utils import topk
@@ -58,7 +42,6 @@ def main(config):
         print("Warm Done.")
 
     # Inference
-    metricResult = {"metricResult": {}}
     if config.test_mode == "FPS":
         torch.cuda.synchronize()
         start_time = time.time()
@@ -74,7 +57,6 @@ def main(config):
 
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
-        metricResult["metricResult"]["FPS"] = round(fps, 3)
         if fps >= config.fps_target:
             print("pass!")
             exit()
@@ -86,7 +68,7 @@ def main(config):
 
         classes = []
         embeddings = []
-        start_time = time.time()
+
         for xb, yb in tqdm(embed_loader):
         
             output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
@@ -94,9 +76,11 @@ def main(config):
             xb = xb.numpy()
             xb = np.ascontiguousarray(xb)
 
-            cuda.memcpy_htod(inputs[0]["allocation"], xb)
+            err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], xb, xb.nbytes)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
             context.execute_v2(allocations)
-            cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
             output = output.reshape(output.shape[0],output.shape[1])
             #print("output shape ",output.shape)
@@ -104,8 +88,7 @@ def main(config):
             classes.extend(yb[0:current_imgs_num].numpy())
             embeddings.extend(output)
 
-        e2e_time = time.time() - start_time
-        print(f"E2E time: {e2e_time:.3f} seconds")
+
         embeddings_dict = dict(zip(crop_paths,embeddings))
 
         pairs = read_pairs(config.datasets_dir + config.pairs_name)
@@ -122,9 +105,6 @@ def main(config):
         #eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr, fill_value="extrapolate")(x), 0., 1.)
         #print('Equal Error Rate (EER): %1.3f' % eer)
 
-        metricResult["metricResult"]["E2E time"] = round(e2e_time, 3)
-        metricResult["metricResult"]["AUC"] = round(auc, 3)
-        metricResult["metricResult"]["Acc"] = round(np.mean(accuracy), 3)
         acc = np.mean(accuracy)
         print(f"Accuracy Check : Test {acc} >= target {config.acc_target}")
         if acc >= config.acc_target:
@@ -133,7 +113,6 @@ def main(config):
         else:
             print("failed!")
             exit(1)
-    print(metricResult)
 
 def parse_config():
     parser = argparse.ArgumentParser()
@@ -157,7 +136,7 @@ def parse_config():
         "--img",
         "--img-size",
         type=int,
-        default=160,
+        default=224,
         help="inference size h,w",
     )
     parser.add_argument("--use_async", action="store_true")
@@ -173,4 +152,4 @@ def parse_config():
 
 if __name__ == "__main__":
     config = parse_config()
-    main(config)
\ No newline at end of file
+    main(config)
diff --git a/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py b/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py
index ae47dc8e..932efbdf 100644
--- a/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py
+++ b/models/cv/face_recognition/facenet/ixrt/load_ixrt_plugin.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
diff --git a/models/cv/face_recognition/facenet/ixrt/modify_batchsize.py b/models/cv/face_recognition/facenet/ixrt/modify_batchsize.py
new file mode 100644
index 00000000..f329119d
--- /dev/null
+++ b/models/cv/face_recognition/facenet/ixrt/modify_batchsize.py
@@ -0,0 +1,39 @@
+import onnx
+import numpy as np
+import argparse
+
+def change_dim(model, bsz):
+    batch_size = bsz
+
+    # The following code changes the first dimension of every input to be batch_size
+    # Modify as appropriate ... note that this requires all inputs to
+    # have the same batch_size
+    for input in model.graph.input:
+        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
+        # Add checks as needed.
+        dim1 = input.type.tensor_type.shape.dim[0]
+        # update dim to be a symbolic value
+        if isinstance(batch_size, str):
+            # set dynamic batch size
+            dim1.dim_param = batch_size
+        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
+            # set given batch size
+            dim1.dim_value = int(batch_size)
+        else:
+            # set batch size of 1
+            dim1.dim_value = 1
+
+    return model
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch_size", type=int)
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_dim(model, args.batch_size)
+onnx.save(model, args.output_model)
diff --git a/models/cv/face_recognition/facenet/ixrt/quant.py b/models/cv/face_recognition/facenet/ixrt/quant.py
index 26413e3e..e4bb3780 100644
--- a/models/cv/face_recognition/facenet/ixrt/quant.py
+++ b/models/cv/face_recognition/facenet/ixrt/quant.py
@@ -1,19 +1,5 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
+
 import torch
 from tensorrt.deploy.api import *
 from tensorrt.deploy.utils.seed import manual_seed
@@ -93,6 +79,14 @@ def quantize_model(args, model_name, model, dataloader):
         quant_format="qdq",
         disable_quant_names=None)
 
+def add_1190_scale(cfg_name):
+    graph_json = json.load(open(cfg_name))
+
+    graph_json["quant_info"]["1190"] = graph_json["quant_info"]["1189"]
+
+    with open(cfg_name, "w") as fh:
+        json.dump(graph_json, fh, indent=4)
+
 def create_argparser(*args, **kwargs):
     parser = ArgumentParser(*args, **kwargs)
     parser.add_argument("--batch_size", type=int, default=64)
@@ -128,6 +122,8 @@ def main():
     else:
         print("[Error] file name not correct ", args.model)
     quantize_model(args, model_name, model, dataloader)
+    json_name = f"./facenet_weights/{model_name}.json"
+    add_1190_scale(json_name)
 
 if __name__ == "__main__":
     main()
diff --git a/models/cv/face_recognition/facenet/ixrt/requirements.txt b/models/cv/face_recognition/facenet/ixrt/requirements.txt
index b1b549a8..09895311 100644
--- a/models/cv/face_recognition/facenet/ixrt/requirements.txt
+++ b/models/cv/face_recognition/facenet/ixrt/requirements.txt
@@ -1,12 +1,9 @@
-tensorflow
-onnxsim
-scikit-learn
-tf_slim
 tqdm
-pycuda
-onnx
 tabulate
 scipy==1.8.0
-pycocotools
-opencv-python==4.6.0.66
-simplejson
\ No newline at end of file
+scikit-learn
+onnx
+onnxsim
+simplejson
+numpy==1.23.5
+opencv-python==4.6.0.66
\ No newline at end of file
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
index 27e5e8ad..d1c79729 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
@@ -43,6 +28,7 @@ do
       --tgt) TGT=${arguments[index]};;
     esac
 done
+
 PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
 echo PROJ_DIR : ${PROJ_DIR}
 RUN_DIR="${PROJ_DIR}/ixrt/"
@@ -102,13 +88,24 @@ if [ $PRECISION == "int8" ];then
     fi
 fi
 
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
+else
+    python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+        --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+    echo "  "Generate ${FINAL_MODEL}
+fi
 
 # Build Engine
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
 ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
-FINAL_MODEL=${SIM_MODEL}
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -130,7 +127,7 @@ python3 ${RUN_DIR}/inference.py     \
     --warm_up=${WARM_UP}            \
     --loop_count ${LOOP_COUNT}      \
     --test_mode ${RUN_MODE}         \
-    --fps_target ${TGT}             \
+    --acc_target ${TGT}             \
     --bsz ${BSZ}; check_status
 
 exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
index 401658ca..5e0cf780 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
index c2c2f176..ea7cb3ec 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
diff --git a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
index 7574347c..21c419d7 100644
--- a/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
+++ b/models/cv/face_recognition/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 #!/bin/bash
 
 EXIT_STATUS=0
diff --git a/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py b/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py
deleted file mode 100644
index f76ba0ff..00000000
--- a/models/cv/face_recognition/facenet/ixrt/tensorflow2pytorch.py
+++ /dev/null
@@ -1,387 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-import tensorflow.compat.v1 as tf
-tf.disable_v2_behavior()
-import torch
-import json
-import os, sys
-
-from dependencies.facenet.src import facenet
-from dependencies.facenet.src.models import inception_resnet_v1 as tf_mdl
-from dependencies.facenet.src.align import detect_face
-
-from models.inception_resnet_v1 import InceptionResnetV1
-from models.mtcnn import PNet, RNet, ONet
-
-
-def import_tf_params(tf_mdl_dir, sess):
-    """Import tensorflow model from save directory.
-    
-    Arguments:
-        tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
-        sess {tensorflow.Session} -- Tensorflow session object.
-    
-    Returns:
-        (list, list, list) -- Tuple of lists containing the layer names,
-            parameter arrays as numpy ndarrays, parameter shapes.
-    """
-    print('\nLoading tensorflow model\n')
-    if callable(tf_mdl_dir):
-        tf_mdl_dir(sess)
-    else:
-        facenet.load_model(tf_mdl_dir)
-
-    print('\nGetting model weights\n')
-    images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
-    print(images_placeholder)
-    tf_layers = tf.trainable_variables()
-    tf_params = sess.run(tf_layers)
-    print(tf.get_default_graph())
-
-    tf_shapes = [p.shape for p in tf_params]
-    tf_layers = [l.name for l in tf_layers]
-    
-    print(tf_shapes)
-    print(tf_shapes)
-    
-    if not callable(tf_mdl_dir):
-        path = os.path.join(tf_mdl_dir, 'layer_description.json')
-    else:
-        path = 'data/layer_description.json'
-    with open(path, 'w') as f:
-        json.dump({l: s for l, s in zip(tf_layers, tf_shapes)}, f)
-
-    return tf_layers, tf_params, tf_shapes
-
-
-def get_layer_indices(layer_lookup, tf_layers):
-    """Giving a lookup of model layer attribute names and tensorflow variable names,
-    find matching parameters.
-    
-    Arguments:
-        layer_lookup {dict} -- Dictionary mapping pytorch attribute names to (partial)
-            tensorflow variable names. Expects dict of the form {'attr': ['tf_name', ...]}
-            where the '...'s are ignored.
-        tf_layers {list} -- List of tensorflow variable names.
-    
-    Returns:
-        list -- The input dictionary with the list of matching inds appended to each item.
-    """
-    layer_inds = {}
-    for name, value in layer_lookup.items():
-        layer_inds[name] = value + [[i for i, n in enumerate(tf_layers) if value[0] in n]]
-    return layer_inds
-
-
-def load_tf_batchNorm(weights, layer):
-    """Load tensorflow weights into nn.BatchNorm object.
-    
-    Arguments:
-        weights {list} -- Tensorflow parameters.
-        layer {torch.nn.Module} -- nn.BatchNorm.
-    """
-    layer.bias.data = torch.tensor(weights[0]).view(layer.bias.data.shape)
-    layer.weight.data = torch.ones_like(layer.weight.data)
-    layer.running_mean = torch.tensor(weights[1]).view(layer.running_mean.shape)
-    layer.running_var = torch.tensor(weights[2]).view(layer.running_var.shape)
-
-
-def load_tf_conv2d(weights, layer, transpose=False):
-    """Load tensorflow weights into nn.Conv2d object.
-    
-    Arguments:
-        weights {list} -- Tensorflow parameters.
-        layer {torch.nn.Module} -- nn.Conv2d.
-    """
-    if isinstance(weights, list):
-        if len(weights) == 2:
-            layer.bias.data = (
-                torch.tensor(weights[1])
-                    .view(layer.bias.data.shape)
-            )
-        weights = weights[0]
-    
-    if transpose:
-        dim_order = (3, 2, 1, 0)
-    else:
-        dim_order = (3, 2, 0, 1)
-
-    layer.weight.data = (
-        torch.tensor(weights)
-            .permute(dim_order)
-            .view(layer.weight.data.shape)
-    )
-
-
-def load_tf_conv2d_trans(weights, layer):
-    return load_tf_conv2d(weights, layer, transpose=True)
-
-
-def load_tf_basicConv2d(weights, layer):
-    """Load tensorflow weights into grouped Conv2d+BatchNorm object.
-    
-    Arguments:
-        weights {list} -- Tensorflow parameters.
-        layer {torch.nn.Module} -- Object containing Conv2d+BatchNorm.
-    """
-    load_tf_conv2d(weights[0], layer.conv)
-    load_tf_batchNorm(weights[1:], layer.bn)
-
-
-def load_tf_linear(weights, layer):
-    """Load tensorflow weights into nn.Linear object.
-    
-    Arguments:
-        weights {list} -- Tensorflow parameters.
-        layer {torch.nn.Module} -- nn.Linear.
-    """
-    if isinstance(weights, list):
-        if len(weights) == 2:
-            layer.bias.data = (
-                torch.tensor(weights[1])
-                    .view(layer.bias.data.shape)
-            )
-        weights = weights[0]
-    layer.weight.data = (
-        torch.tensor(weights)
-            .transpose(-1, 0)
-            .view(layer.weight.data.shape)
-    )
-
-
-# High-level parameter-loading functions:
-
-def load_tf_block35(weights, layer):
-    load_tf_basicConv2d(weights[:4], layer.branch0)
-    load_tf_basicConv2d(weights[4:8], layer.branch1[0])
-    load_tf_basicConv2d(weights[8:12], layer.branch1[1])
-    load_tf_basicConv2d(weights[12:16], layer.branch2[0])
-    load_tf_basicConv2d(weights[16:20], layer.branch2[1])
-    load_tf_basicConv2d(weights[20:24], layer.branch2[2])
-    load_tf_conv2d(weights[24:26], layer.conv2d)
-
-
-def load_tf_block17_8(weights, layer):
-    load_tf_basicConv2d(weights[:4], layer.branch0)
-    load_tf_basicConv2d(weights[4:8], layer.branch1[0])
-    load_tf_basicConv2d(weights[8:12], layer.branch1[1])
-    load_tf_basicConv2d(weights[12:16], layer.branch1[2])
-    load_tf_conv2d(weights[16:18], layer.conv2d)
-
-
-def load_tf_mixed6a(weights, layer):
-    if len(weights) != 16:
-        raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 16')
-    load_tf_basicConv2d(weights[:4], layer.branch0)
-    load_tf_basicConv2d(weights[4:8], layer.branch1[0])
-    load_tf_basicConv2d(weights[8:12], layer.branch1[1])
-    load_tf_basicConv2d(weights[12:16], layer.branch1[2])
-
-
-def load_tf_mixed7a(weights, layer):
-    if len(weights) != 28:
-        raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 28')
-    load_tf_basicConv2d(weights[:4], layer.branch0[0])
-    load_tf_basicConv2d(weights[4:8], layer.branch0[1])
-    load_tf_basicConv2d(weights[8:12], layer.branch1[0])
-    load_tf_basicConv2d(weights[12:16], layer.branch1[1])
-    load_tf_basicConv2d(weights[16:20], layer.branch2[0])
-    load_tf_basicConv2d(weights[20:24], layer.branch2[1])
-    load_tf_basicConv2d(weights[24:28], layer.branch2[2])
-
-
-def load_tf_repeats(weights, layer, rptlen, subfun):
-    if len(weights) % rptlen != 0:
-        raise ValueError(f'Number of weight arrays ({len(weights)}) not divisible by {rptlen}')
-    weights_split = [weights[i:i+rptlen] for i in range(0, len(weights), rptlen)]
-    for i, w in enumerate(weights_split):
-        subfun(w, getattr(layer, str(i)))
-
-
-def load_tf_repeat_1(weights, layer):
-    load_tf_repeats(weights, layer, 26, load_tf_block35)
-
-
-def load_tf_repeat_2(weights, layer):
-    load_tf_repeats(weights, layer, 18, load_tf_block17_8)
-
-
-def load_tf_repeat_3(weights, layer):
-    load_tf_repeats(weights, layer, 18, load_tf_block17_8)
-
-
-def test_loaded_params(mdl, tf_params, tf_layers):
-    """Check each parameter in a pytorch model for an equivalent parameter
-    in a list of tensorflow variables.
-    
-    Arguments:
-        mdl {torch.nn.Module} -- Pytorch model.
-        tf_params {list} -- List of ndarrays representing tensorflow variables.
-        tf_layers {list} -- Corresponding list of tensorflow variable names.
-    """
-    tf_means = torch.stack([torch.tensor(p).mean() for p in tf_params])
-    for name, param in mdl.named_parameters():
-        pt_mean = param.data.mean()
-        matching_inds = ((tf_means - pt_mean).abs() < 1e-8).nonzero()
-        print(f'{name} equivalent to {[tf_layers[i] for i in matching_inds]}')
-
-
-def compare_model_outputs(pt_mdl, sess, test_data):
-    """Given some testing data, compare the output of pytorch and tensorflow models.
-    
-    Arguments:
-        pt_mdl {torch.nn.Module} -- Pytorch model.
-        sess {tensorflow.Session} -- Tensorflow session object.
-        test_data {torch.Tensor} -- Pytorch tensor.
-    """
-    print('\nPassing test data through TF model\n')
-    if isinstance(sess, tf.Session):
-        images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
-        phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
-        embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
-        feed_dict = {images_placeholder: test_data.numpy(), phase_train_placeholder: False}
-        tf_output = torch.tensor(sess.run(embeddings, feed_dict=feed_dict))
-    else:
-        tf_output = sess(test_data)
-
-    print(tf_output.shape, tf_output)
-
-    print('\nPassing test data through PT model\n')
-    pt_output = pt_mdl(test_data.permute(0, 3, 1, 2))
-    print(pt_output.shape, pt_output)
-
-    distance = (tf_output - pt_output).norm()
-    print(f'\nDistance {distance}\n')
-
-
-def compare_mtcnn(pt_mdl, tf_fun, sess, ind, test_data):
-    tf_mdls = tf_fun(sess)
-    tf_mdl = tf_mdls[ind]
-
-    print('\nPassing test data through TF model\n')
-    tf_output = tf_mdl(test_data.numpy())
-    tf_output = [torch.tensor(out) for out in tf_output]
-    print('\n'.join([str(o.view(-1)[:10]) for o in tf_output]))
-
-    print('\nPassing test data through PT model\n')
-    with torch.no_grad():
-        pt_output = pt_mdl(test_data.permute(0, 3, 2, 1))
-    pt_output = [torch.tensor(out) for out in pt_output]
-    for i in range(len(pt_output)):
-        if len(pt_output[i].shape) == 4:
-            pt_output[i] = pt_output[i].permute(0, 3, 2, 1).contiguous()
-    print('\n'.join([str(o.view(-1)[:10]) for o in pt_output]))
-
-    distance = [(tf_o - pt_o).norm() for tf_o, pt_o in zip(tf_output, pt_output)]
-    print(f'\nDistance {distance}\n')
-
-
-def load_tf_model_weights(mdl, layer_lookup, tf_mdl_dir, is_resnet=True, arg_num=None):
-    """Load tensorflow parameters into a pytorch model.
-    
-    Arguments:
-        mdl {torch.nn.Module} -- Pytorch model.
-        layer_lookup {[type]} -- Dictionary mapping pytorch attribute names to (partial)
-            tensorflow variable names, and a function suitable for loading weights.
-            Expects dict of the form {'attr': ['tf_name', function]}. 
-        tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
-    """
-    tf.reset_default_graph()
-    with tf.Session() as sess:
-        tf_layers, tf_params, tf_shapes = import_tf_params(tf_mdl_dir, sess)
-        layer_info = get_layer_indices(layer_lookup, tf_layers)
-
-        for layer_name, info in layer_info.items():
-            print(f'Loading {info[0]}/* into {layer_name}')
-            weights = [tf_params[i] for i in info[2]]
-            layer = getattr(mdl, layer_name)
-            info[1](weights, layer)
-
-        test_loaded_params(mdl, tf_params, tf_layers)
-
-        if is_resnet:
-            compare_model_outputs(mdl, sess, torch.randn(5, 160, 160, 3).detach())
-
-
-def tensorflow2pytorch(args):
-    lookup_inception_resnet_v1 = {
-        'conv2d_1a': ['InceptionResnetV1/Conv2d_1a_3x3', load_tf_basicConv2d],
-        'conv2d_2a': ['InceptionResnetV1/Conv2d_2a_3x3', load_tf_basicConv2d],
-        'conv2d_2b': ['InceptionResnetV1/Conv2d_2b_3x3', load_tf_basicConv2d],
-        'conv2d_3b': ['InceptionResnetV1/Conv2d_3b_1x1', load_tf_basicConv2d],
-        'conv2d_4a': ['InceptionResnetV1/Conv2d_4a_3x3', load_tf_basicConv2d],
-        'conv2d_4b': ['InceptionResnetV1/Conv2d_4b_3x3', load_tf_basicConv2d],
-        'repeat_1': ['InceptionResnetV1/Repeat/block35', load_tf_repeat_1],
-        'mixed_6a': ['InceptionResnetV1/Mixed_6a', load_tf_mixed6a],
-        'repeat_2': ['InceptionResnetV1/Repeat_1/block17', load_tf_repeat_2],
-        'mixed_7a': ['InceptionResnetV1/Mixed_7a', load_tf_mixed7a],
-        'repeat_3': ['InceptionResnetV1/Repeat_2/block8', load_tf_repeat_3],
-        'block8': ['InceptionResnetV1/Block8', load_tf_block17_8],
-        'last_linear': ['InceptionResnetV1/Bottleneck/weights', load_tf_linear],
-        'last_bn': ['InceptionResnetV1/Bottleneck/BatchNorm', load_tf_batchNorm],
-        # 'logits': ['Logits', load_tf_linear],
-    }
-
-    print('\nLoad CASIA-Webface-trained weights and save\n')
-    mdl = InceptionResnetV1(num_classes=10575).eval()
-    tf_mdl_dir = args.facenet_pb_path
-
-    load_tf_model_weights(mdl, lookup_inception_resnet_v1, tf_mdl_dir)
-    # print(f'????????')
-    # data_name = 'casia-webfacexxxxxxx'
-    # state_dict = mdl.state_dict()
-    # torch.save(state_dict, f'{tf_mdl_dir}-{data_name}.pt')
-
-    x = torch.rand(64, 3, 160, 160)#.cuda()
-    # y = resnet(x)
-    # print(y.shape)
-
-
-    f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
-    torch.onnx.export(mdl, x, f, verbose=False, opset_version=11, 
-                    input_names=['input'], output_names=['output'], dynamic_axes=None)
-
-
-    
-import argparse
-def parse_args():
-    parser = argparse.ArgumentParser("deploy facenet")
-    parser.add_argument("--facenet_weights_path", default="", help="onnx model path")
-    parser.add_argument("--facenet_pb_path", default="", help="")
-    parser.add_argument("--onnx_save_name", default="", help="")
-
-    return parser.parse_args()
-args = parse_args()
-
-tensorflow2pytorch(args)
-
-
-# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-# print('Running on device: {}'.format(device))
-
-# # Load pretrained resnet model
-# resnet = InceptionResnetV1(
-#     classify=False,
-#     pretrained='casia-webface'
-# )#.to(device)
-
-# x = torch.rand(64, 3, 160, 160)#.cuda()
-# y = resnet(x)
-# print(y.shape)
-
-
-# f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
-# torch.onnx.export(resnet, x, f, verbose=False, opset_version=11, input_names=['input'], output_names=['output'], dynamic_axes=None)
diff --git a/models/cv/face_recognition/facenet/ixrt/utils.py b/models/cv/face_recognition/facenet/ixrt/utils.py
index ab8f213b..f908e887 100644
--- a/models/cv/face_recognition/facenet/ixrt/utils.py
+++ b/models/cv/face_recognition/facenet/ixrt/utils.py
@@ -1,18 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
 import os
 import math
 
diff --git a/tests/model_info.json b/tests/model_info.json
index a717db67..ac68ee0a 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -3254,7 +3254,7 @@
             "github_repo": "",
             "github_branch": "",
             "github_path": "",
-            "datasets": "local/tmp",
+            "datasets": "local/facenet_datasets",
             "download_url": "https://drive.google.com/open?id=1R77HmFADxe87GmoLwzfgMu_HY0IhcyBz",
             "need_third_part": "",
             "precisions": [
-- 
Gitee


From 8ef7edfc4b8238e8f01c626c6caf36dcd8ab7e24 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 14:49:19 +0800
Subject: [PATCH 08/15] update solov1

---
 .../solov1/ixrt/ci/prepare.sh                     | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh b/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
index 66c8f9d0..107ffda4 100644
--- a/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
+++ b/models/cv/instance_segmentation/solov1/ixrt/ci/prepare.sh
@@ -27,20 +27,7 @@ fi
 
 pip install -r requirements.txt
 
-cp -r /root/data/3rd_party/mmcv-v1.7.1 ./mmcv
-cp -r -T /root/data/repos/deepsparkhub/toolbox/MMDetection/patch/mmcv/v1.7.1 ./mmcv
-cd mmcv
-rm -rf mmcv/ops/csrc/common/cuda/spconv/ mmcv/ops/csrc/common/utils/spconv/
-rm -f mmcv/ops/csrc/pytorch/cpu/sparse_*
-rm -f mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.cu
-rm -f mmcv/ops/csrc/pytorch/cuda/sparse_*
-rm -f mmcv/ops/csrc/pytorch/sp*
-
-bash clean_mmcv.sh
-bash build_mmcv.sh
-bash install_mmcv.sh
-cd ..
+pip install /root/data/install/mmcv_full-1.7.0+corex.20250108131027-cp310-cp310-linux_x86_64.whl
 
 mkdir -p checkpoints
 ln -s /root/data/checkpoints/solo_r50_fpn_3x_coco_20210901_012353-11d224d7.pth ./
-- 
Gitee


From 9e39b06482dd71f8d3030116764e8420f0ce76ae Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Wed, 2 Jul 2025 11:18:20 +0800
Subject: [PATCH 09/15] update vllm idefics3 minicpm_v llava_next_video_7b

---
 .../idefics3/vllm/README.md                   |   9 +-
 .../idefics3/vllm/ci/prepare.sh               |   1 +
 .../vllm/offline_inference_vision_language.py | 276 +++++++++++-----
 .../sample_demo_1.mp4                         | Bin
 .../minicpm_v/vllm/README.md                  |   8 +-
 .../vllm/offline_inference_vision_language.py | 309 +++++++++++++-----
 tests/model_info.json                         |   2 +-
 tests/run_vllm.py                             |  11 +-
 8 files changed, 430 insertions(+), 186 deletions(-)
 rename models/multimodal/vision_language_model/llava_next_video_7b/vllm/{video-eample-data => video-example-data}/sample_demo_1.mp4 (100%)
 mode change 100755 => 100644

diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md
index 5117a327..78d4117c 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/README.md
+++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md
@@ -22,8 +22,8 @@ significantly enhancing capabilities around OCR, document understanding and visu
 ```bash
 cp -r ../../vllm_public_assets/ ./
 
-# Download model from the website and make sure the model's path is "data/Aria"
-mkdir data
+# Download model from the website and make sure the model's path is "idefics3"
+mkdir HuggingFaceM4
 ```
 
 ### Install Dependencies
@@ -36,13 +36,14 @@ In order to run the model smoothly, you need to get the sdk from [resource cente
 yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
+
+pip install transformers==4.50.3
 ```
 
 ## Model Inference
 
 ```bash
-export VLLM_ASSETS_CACHE=../vllm/
-python3 offline_inference_vision_language.py --model data/Idefics3-8B-Llama3 -tp 4 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+python3 offline_inference_vision_language.py --model-type idefics3
 ```
 
 ## Model Results
diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
index 7232aa29..26f7a3ff 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
+++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh
@@ -25,3 +25,4 @@ else
 fi
 
 cp -r ../../vllm_public_assets/ ./
+pip install transformers==4.50.3
\ No newline at end of file
diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
index 958131c6..c2593603 100644
--- a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
+++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py
@@ -1,55 +1,67 @@
-#!/bin/bash
-# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
-with the correct prompt format on vision language models.
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for text generation.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
-import sys
-from pathlib import Path
 import os
-sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
-import argparse
-import dataclasses
-import inspect
-from vllm.assets.image import ImageAsset
-from vllm.assets.video import VideoAsset
+import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional
 
+from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
 
 from vllm import LLM, EngineArgs, SamplingParams
-from utils import sampling_add_cli_args
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm.lora.request import LoRARequest
+from vllm.utils import FlexibleArgumentParser
+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: list[str]
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
 
 # Idefics3-8B-Llama3
-def run_idefics3(question: str, engine_params, modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
-    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
+    model_name = "./idefics3"
 
-    llm = LLM(**engine_params)
-    prompt = (
-        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=2,
+        enforce_eager=True,
+        # if you are running out of memory, you can reduce the "longest_edge".
+        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
+        mm_processor_kwargs={
+            "size": {
+                "longest_edge": 3 * 364
+            },
+        },
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
     )
-    stop_token_ids = None
-    return llm, prompt, stop_token_ids
-
+    prompts = [(
+        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
+    ) for question in questions]
 
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
 
+model_example_map = {
+    "idefics3": run_idefics3,
+}
 
 def get_multi_modal_input(args):
     """
@@ -60,92 +72,188 @@ def get_multi_modal_input(args):
     """
     if args.modality == "image":
         # Input image and question
-        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-        img_question = "What is the content of this image?"
+        image = ImageAsset("cherry_blossom") \
+            .pil_image.convert("RGB")
+        img_questions = [
+            "What is the content of this image?",
+            "Describe the content of this image in detail.",
+            "What's in the image?",
+            "Where is this image taken?",
+        ]
 
         return {
             "data": image,
-            "question": img_question,
+            "questions": img_questions,
         }
 
     if args.modality == "video":
         # Input video and question
         video = VideoAsset(name="sample_demo_1.mp4",
                            num_frames=args.num_frames).np_ndarrays
-        vid_question = "Why is this video funny?"
+        vid_questions = ["Why is this video funny?"]
 
         return {
             "data": video,
-            "question": vid_question,
+            "questions": vid_questions,
         }
 
     msg = f"Modality {args.modality} is not supported."
     raise ValueError(msg)
 
 
-    
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help='Number of prompts to run.')
-    parser.add_argument('--modality',
-                        type=str,
-                        default="image",
-                        help='Modality of the input.')
-    parser.add_argument('--num-frames',
-                        type=int,
-                        default=16,
-                        help='Number of frames to extract from the video.')
-    parser = EngineArgs.add_cli_args(parser)
-    parser = sampling_add_cli_args(parser)
-    args = parser.parse_args()
-    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
-    sampling_args = [
-        param.name
-        for param in list(
-            inspect.signature(SamplingParams).parameters.values()
-        )
-    ]
-    engine_params = {attr: getattr(args, attr) for attr in engine_args}
-    sampling_params = {
-        attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)
-    }
-    
+def apply_image_repeat(image_repeat_prob, num_prompts, data,
+                       prompts: list[str], modality):
+    """Repeats images with provided probability of "image_repeat_prob". 
+    Used to simulate hit/miss for the MM preprocessor cache.
+    """
+    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
+    no_yes = [0, 1]
+    probs = [1.0 - image_repeat_prob, image_repeat_prob]
+
+    inputs = []
+    cur_image = data
+    for i in range(num_prompts):
+        if image_repeat_prob is not None:
+            res = random.choices(no_yes, probs)[0]
+            if res == 0:
+                # No repeat => Modify one pixel
+                cur_image = cur_image.copy()
+                new_val = (i // 256 // 256, i // 256, i % 256)
+                cur_image.putpixel((0, 0), new_val)
+
+        inputs.append({
+            "prompt": prompts[i % len(prompts)],
+            "multi_modal_data": {
+                modality: cur_image
+            }
+        })
+
+    return inputs
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
     modality = args.modality
     mm_input = get_multi_modal_input(args)
     data = mm_input["data"]
-    question = mm_input["question"]
+    questions = mm_input["questions"]
+
+    req_data = model_example_map[model](questions, modality)
 
-    llm, prompt, stop_token_ids = run_idefics3(question,engine_params,args.modality)
-    sampling_params['stop_token_ids'] = stop_token_ids
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
+    # Don't want to check the flag multiple times, so just hijack `prompts`.
+    prompts = req_data.prompts if args.use_different_prompt_per_request else [
+        req_data.prompts[0]
+    ]
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(**sampling_params)
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=64,
+                                     stop_token_ids=req_data.stop_token_ids)
 
     assert args.num_prompts > 0
     if args.num_prompts == 1:
         # Single inference
         inputs = {
-            "prompt": prompt,
+            "prompt": prompts[0],
             "multi_modal_data": {
                 modality: data
             },
         }
-
     else:
         # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                modality: data
-            },
-        } for _ in range(args.num_prompts)]
+        if args.image_repeat_prob is not None:
+            # Repeat images with specified probability of "image_repeat_prob"
+            inputs = apply_image_repeat(args.image_repeat_prob,
+                                        args.num_prompts, data, prompts,
+                                        modality)
+        else:
+            # Use the same image for all prompts
+            inputs = [{
+                "prompt": prompts[i % len(prompts)],
+                "multi_modal_data": {
+                    modality: data
+                },
+            } for i in range(args.num_prompts)]
+
+    if args.time_generate:
+        import time
+        start_time = time.time()
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
+        elapsed_time = time.time() - start_time
+        print("-- generate time = {}".format(elapsed_time))
 
-    outputs = llm.generate(inputs, sampling_params=sampling_params)
+    else:
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
-        print(generated_text)
\ No newline at end of file
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="llava",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=4,
+                        help='Number of prompts to run.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=['image', 'video'],
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+
+    parser.add_argument(
+        '--image-repeat-prob',
+        type=float,
+        default=None,
+        help='Simulates the hit-ratio for multi-modal preprocessor cache'
+        ' (if enabled)')
+
+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+
+    parser.add_argument(
+        '--time-generate',
+        action='store_true',
+        help='If True, then print the total generate() call time')
+
+    parser.add_argument(
+        '--use-different-prompt-per-request',
+        action='store_true',
+        help='If True, then use different prompt (with the same multi-modal '
+        'data) for each request.')
+
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4
old mode 100755
new mode 100644
similarity index 100%
rename from models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4
rename to models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4
diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md
index a404f6ec..ea1c8d74 100644
--- a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md
+++ b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md
@@ -16,13 +16,12 @@ techniques, making it suitable for deployment in resource-constrained environmen
 
 ### Prepare Resources
 
-- Model: <https://huggingface.co/openbmb/MiniCPM-V-2>
+- Model: <https://huggingface.co/openbmb/MiniCPM-V-2_6>
 
 ```bash
 cp -r ../../vllm_public_assets/ ./
 
-# Download model from the website and make sure the model's path is "data/Aria"
-mkdir data
+# Download model from the website and make sure the model's path is "./minicpm_v"
 ```
 
 ### Install Dependencies
@@ -42,8 +41,7 @@ pip install timm==0.9.10
 ## Model Inference
 
 ```bash
-export VLLM_ASSETS_CACHE=../vllm/
-PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model data/MiniCPM-V-2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+python3 offline_inference_vision_language.py --model-type minicpmv
 ```
 
 ## Model Results
diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py
index 2fc88f46..f6df6f98 100644
--- a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py
+++ b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py
@@ -1,42 +1,42 @@
-#!/bin/bash
-# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
-with the correct prompt format on vision language models.
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for text generation.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
-import sys
-from pathlib import Path
 import os
-sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
-import argparse
-import dataclasses
-import inspect
+import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional
+
+from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
+
+from vllm import LLM, EngineArgs, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm import LLM, EngineArgs, SamplingParams
-from utils import sampling_add_cli_args
+from vllm.lora.request import LoRARequest
+from vllm.utils import FlexibleArgumentParser
+
+
+class ModelRequestData(NamedTuple):
+    engine_args: EngineArgs
+    prompts: list[str]
+    stop_token_ids: Optional[list[int]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
+
+
+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.
 
 # MiniCPM-V
-def run_minicpmv(question, engine_params, model,modality):
-    assert modality == "image"
+def run_minicpmv_base(questions: list[str], modality: str, model_name):
+    assert modality in ["image", "video"]
+    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
+
     # 2.0
     # The official repo doesn't work yet, so we need to use a fork for now
     # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
@@ -45,10 +45,25 @@ def run_minicpmv(question, engine_params, model,modality):
     # 2.5
     # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
 
-    #2.6
-    tokenizer = AutoTokenizer.from_pretrained(model,
+    # 2.6
+    # model_name = "openbmb/MiniCPM-V-2_6"
+    # o2.6
+
+    # modality supports
+    # 2.0: image
+    # 2.5: image
+    # 2.6: image, video
+    # o2.6: image, video, audio
+    # model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               trust_remote_code=True)
-    llm = LLM(**engine_params)
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=4096,
+        max_num_seqs=2,
+        trust_remote_code=True,
+        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+    )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
     # stop_token_ids = [tokenizer.eos_id]
@@ -56,18 +71,38 @@ def run_minicpmv(question, engine_params, model,modality):
     # 2.5
     # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
 
-    # 2.6
+    # 2.6 / o2.6
     stop_tokens = ['<|im_end|>', '<|endoftext|>']
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
-    messages = [{
-        'role': 'user',
-        'content': f'(<image>./</image>)\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
-    return llm, prompt, stop_token_ids
+    modality_placeholder = {
+        "image": "(<image>./</image>)",
+        "video": "(<video>./</video>)",
+    }
+
+    prompts = [
+        tokenizer.apply_chat_template(
+            [{
+                'role': 'user',
+                'content': f"{modality_placeholder[modality]}\n{question}"
+            }],
+            tokenize=False,
+            add_generation_prompt=True) for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
+
+def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
+    return run_minicpmv_base(questions, modality, "./minicpm_v")
+
+
+model_example_map = {
+    "minicpmv": run_minicpmv,
+}
 
 
 def get_multi_modal_input(args):
@@ -79,92 +114,188 @@ def get_multi_modal_input(args):
     """
     if args.modality == "image":
         # Input image and question
-        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-        img_question = "What is the content of this image?"
+        image = ImageAsset("cherry_blossom") \
+            .pil_image.convert("RGB")
+        img_questions = [
+            "What is the content of this image?",
+            "Describe the content of this image in detail.",
+            "What's in the image?",
+            "Where is this image taken?",
+        ]
 
         return {
             "data": image,
-            "question": img_question,
+            "questions": img_questions,
         }
 
     if args.modality == "video":
         # Input video and question
         video = VideoAsset(name="sample_demo_1.mp4",
                            num_frames=args.num_frames).np_ndarrays
-        vid_question = "Why is this video funny?"
+        vid_questions = ["Why is this video funny?"]
 
         return {
             "data": video,
-            "question": vid_question,
+            "questions": vid_questions,
         }
 
     msg = f"Modality {args.modality} is not supported."
     raise ValueError(msg)
 
 
-    
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help='Number of prompts to run.')
-    parser.add_argument('--modality',
-                        type=str,
-                        default="image",
-                        help='Modality of the input.')
-    parser.add_argument('--num-frames',
-                        type=int,
-                        default=16,
-                        help='Number of frames to extract from the video.')
-    parser = EngineArgs.add_cli_args(parser)
-    parser = sampling_add_cli_args(parser)
-    args = parser.parse_args()
-    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
-    sampling_args = [
-        param.name
-        for param in list(
-            inspect.signature(SamplingParams).parameters.values()
-        )
-    ]
-    engine_params = {attr: getattr(args, attr) for attr in engine_args}
-    sampling_params = {
-        attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)
-    }
-    
+def apply_image_repeat(image_repeat_prob, num_prompts, data,
+                       prompts: list[str], modality):
+    """Repeats images with provided probability of "image_repeat_prob". 
+    Used to simulate hit/miss for the MM preprocessor cache.
+    """
+    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
+    no_yes = [0, 1]
+    probs = [1.0 - image_repeat_prob, image_repeat_prob]
+
+    inputs = []
+    cur_image = data
+    for i in range(num_prompts):
+        if image_repeat_prob is not None:
+            res = random.choices(no_yes, probs)[0]
+            if res == 0:
+                # No repeat => Modify one pixel
+                cur_image = cur_image.copy()
+                new_val = (i // 256 // 256, i // 256, i % 256)
+                cur_image.putpixel((0, 0), new_val)
+
+        inputs.append({
+            "prompt": prompts[i % len(prompts)],
+            "multi_modal_data": {
+                modality: cur_image
+            }
+        })
+
+    return inputs
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
     modality = args.modality
     mm_input = get_multi_modal_input(args)
     data = mm_input["data"]
-    question = mm_input["question"]
+    questions = mm_input["questions"]
 
-    llm, prompt, stop_token_ids = run_minicpmv(question,engine_params, args.model, args.modality)
-    sampling_params['stop_token_ids'] = stop_token_ids
+    req_data = model_example_map[model](questions, modality)
+
+    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+    llm = LLM(**engine_args)
+
+    # To maintain code compatibility in this script, we add LoRA here.
+    # You can also add LoRA using:
+    # llm.generate(prompts, lora_request=lora_request,...)
+    if req_data.lora_requests:
+        for lora_request in req_data.lora_requests:
+            llm.llm_engine.add_lora(lora_request=lora_request)
+
+    # Don't want to check the flag multiple times, so just hijack `prompts`.
+    prompts = req_data.prompts if args.use_different_prompt_per_request else [
+        req_data.prompts[0]
+    ]
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(**sampling_params)
+    sampling_params = SamplingParams(temperature=0.2,
+                                     max_tokens=64,
+                                     stop_token_ids=req_data.stop_token_ids)
 
     assert args.num_prompts > 0
     if args.num_prompts == 1:
         # Single inference
         inputs = {
-            "prompt": prompt,
+            "prompt": prompts[0],
             "multi_modal_data": {
                 modality: data
             },
         }
-
     else:
         # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                modality: data
-            },
-        } for _ in range(args.num_prompts)]
+        if args.image_repeat_prob is not None:
+            # Repeat images with specified probability of "image_repeat_prob"
+            inputs = apply_image_repeat(args.image_repeat_prob,
+                                        args.num_prompts, data, prompts,
+                                        modality)
+        else:
+            # Use the same image for all prompts
+            inputs = [{
+                "prompt": prompts[i % len(prompts)],
+                "multi_modal_data": {
+                    modality: data
+                },
+            } for i in range(args.num_prompts)]
 
-    outputs = llm.generate(inputs, sampling_params=sampling_params)
+    if args.time_generate:
+        import time
+        start_time = time.time()
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
+        elapsed_time = time.time() - start_time
+        print("-- generate time = {}".format(elapsed_time))
+
+    else:
+        outputs = llm.generate(inputs, sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
-        print(generated_text)
\ No newline at end of file
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for text generation')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="llava",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=4,
+                        help='Number of prompts to run.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=['image', 'video'],
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
+    parser.add_argument("--seed",
+                        type=int,
+                        default=None,
+                        help="Set the seed when initializing `vllm.LLM`.")
+
+    parser.add_argument(
+        '--image-repeat-prob',
+        type=float,
+        default=None,
+        help='Simulates the hit-ratio for multi-modal preprocessor cache'
+        ' (if enabled)')
+
+    parser.add_argument(
+        '--disable-mm-preprocessor-cache',
+        action='store_true',
+        help='If True, disables caching of multi-modal preprocessor/mapper.')
+
+    parser.add_argument(
+        '--time-generate',
+        action='store_true',
+        help='If True, then print the total generate() call time')
+
+    parser.add_argument(
+        '--use-different-prompt-per-request',
+        action='store_true',
+        help='If True, then use different prompt (with the same multi-modal '
+        'data) for each request.')
+
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index ac68ee0a..054ae60e 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -7098,7 +7098,7 @@
             "github_branch": "",
             "github_path": "",
             "datasets": "",
-            "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2",
+            "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2_6",
             "need_third_part": false,
             "precisions": [
                 "fp16"
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
index c6100a40..be795462 100644
--- a/tests/run_vllm.py
+++ b/tests/run_vllm.py
@@ -229,7 +229,13 @@ def run_nlp_testcase(model):
             export VLLM_ASSETS_CACHE=../vllm/
             python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
             """
-        elif model_name == "h2vol" or model_name == "idefics3":
+        elif model_name == "idefics3":
+            script = f"""
+            set -x
+            cd ../{model['model_path']}
+            python3 offline_inference_vision_language.py --model-type idefics3
+            """
+        elif model_name == "h2vol":
             script = f"""
             set -x
             cd ../{model['model_path']}
@@ -240,8 +246,7 @@ def run_nlp_testcase(model):
             script = f"""
             set -x
             cd ../{model['model_path']}
-            export VLLM_ASSETS_CACHE=../vllm/
-            PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+            python3 offline_inference_vision_language.py --model-type minicpmv
             """
         elif model_name == "llama-3.2":
             script = f"""
-- 
Gitee


From 6fd94e801a829b193a7a5f17e0de6418e3bf3947 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 09:31:03 +0800
Subject: [PATCH 10/15] update ixrt yolov4

---
 .../cv/object_detection/yolov4/ixrt/README.md |  23 +-
 .../yolov4/ixrt/build_engine.py               |  58 +----
 .../yolov4/ixrt/calibration_dataset.py        |  31 +++
 .../yolov4/ixrt/ci/prepare.sh                 |   8 +-
 .../yolov4/ixrt/coco_labels.py                |  14 --
 .../yolov4/ixrt/config/YOLOV4_CONFIG          |  49 ++++
 .../object_detection/yolov4/ixrt/cut_model.py |  16 +-
 .../cv/object_detection/yolov4/ixrt/deploy.py | 104 ++-------
 .../cv/object_detection/yolov4/ixrt/export.py |   7 +-
 .../yolov4/ixrt/load_ixrt_plugin.py           |  16 +-
 .../yolov4/ixrt/modify_batchsize.py           |  54 +++++
 .../cv/object_detection/yolov4/ixrt/quant.py  |  92 ++------
 .../scripts/infer_yolov4_fp16_accuracy.sh     | 201 +++++++++++-----
 .../scripts/infer_yolov4_fp16_performance.sh  | 202 +++++++++++-----
 .../scripts/infer_yolov4_int8_accuracy.sh     | 215 +++++++++++------
 .../scripts/infer_yolov4_int8_performance.sh  | 216 ++++++++++++------
 .../yolov4/ixrt/simplify_model.py             |  21 ++
 tests/model_info.json                         |   2 +-
 tests/run_ixrt.py                             |   5 +-
 19 files changed, 801 insertions(+), 533 deletions(-)
 create mode 100644 models/cv/object_detection/yolov4/ixrt/calibration_dataset.py
 create mode 100644 models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG
 create mode 100644 models/cv/object_detection/yolov4/ixrt/modify_batchsize.py
 create mode 100644 models/cv/object_detection/yolov4/ixrt/simplify_model.py

diff --git a/models/cv/object_detection/yolov4/ixrt/README.md b/models/cv/object_detection/yolov4/ixrt/README.md
index e425710b..f6bd831e 100644
--- a/models/cv/object_detection/yolov4/ixrt/README.md
+++ b/models/cv/object_detection/yolov4/ixrt/README.md
@@ -38,21 +38,26 @@ pip3 install -r requirements.txt
 git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git yolov4
 
 # download weight
-mkdir data
-wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P data
+mkdir checkpoints
+wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P checkpoints
 
 # export onnx model
-python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight data/yolov4.weights --batchsize 16 --output data/yolov4.onnx
-mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
-
-# Use onnxsim optimize onnx model
-onnxsim data/yolov4.onnx data/yolov4_sim.onnx
-
-# Make sure the dataset path is "data/coco"
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight yolov4.weights --output yolov4.onnx
+mv yolov4.onnx checkpoints/yolov4.onnx
 ```
 
 ## Model Inference
 
+```bash
+export PROJ_DIR=./
+export DATASETS_DIR=./coco/
+export CHECKPOINTS_DIR=./checkpoints
+export COCO_GT=./coco/annotations/instances_val2017.json
+export EVAL_DIR=./coco/images/val2017
+export RUN_DIR=./
+export CONFIG_DIR=config/YOLOV4_CONFIG
+```
+
 ### FP16
 
 ```bash
diff --git a/models/cv/object_detection/yolov4/ixrt/build_engine.py b/models/cv/object_detection/yolov4/ixrt/build_engine.py
index ec4080ed..d47e45e5 100644
--- a/models/cv/object_detection/yolov4/ixrt/build_engine.py
+++ b/models/cv/object_detection/yolov4/ixrt/build_engine.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import os
 import cv2
 import argparse
@@ -19,13 +5,11 @@ import numpy as np
 
 import torch
 import tensorrt
-from tensorrt import Dims
 
 from load_ixrt_plugin import load_ixrt_plugin
 load_ixrt_plugin()
 
-
-def build_engine_trtapi_staticshape(config):
+def main(config):
     IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
     builder = tensorrt.Builder(IXRT_LOGGER)
     EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -42,42 +26,6 @@ def build_engine_trtapi_staticshape(config):
     engine_file_path = config.engine
     with open(engine_file_path, "wb") as f:
         f.write(plan)
-    print("Build static shape engine done!")
-
-
-def build_engine_trtapi_dynamicshape(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-
-    profile = builder.create_optimization_profile()
-    profile.set_shape("input",
-                        Dims([1, 3, 608, 608]),
-                        Dims([32, 3, 608, 608]),
-                        Dims([64, 3, 608, 608]),
-    )
-    build_config.add_optimization_profile(profile)
-
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(config.model)
-    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
-    # print("precision : ", precision)
-    build_config.set_flag(precision)
-
-    # set dynamic
-    num_inputs = network.num_inputs
-    for i in range(num_inputs):
-        input_tensor = network.get_input(i)
-        input_tensor.shape = Dims([-1, 3, 608, 608])
-
-    plan = builder.build_serialized_network(network, build_config)
-    engine_file_path = config.engine
-    with open(engine_file_path, "wb") as f:
-        f.write(plan)
-    print("Build dynamic shape engine done!")
-
 
 def parse_args():
     parser = argparse.ArgumentParser()
@@ -90,8 +38,6 @@ def parse_args():
     args = parser.parse_args()
     return args
 
-
 if __name__ == "__main__":
     args = parse_args()
-    build_engine_trtapi_staticshape(args)
-    # build_engine_trtapi_dynamicshape(args)
+    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov4/ixrt/calibration_dataset.py
new file mode 100644
index 00000000..578e013d
--- /dev/null
+++ b/models/cv/object_detection/yolov4/ixrt/calibration_dataset.py
@@ -0,0 +1,31 @@
+import os
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+
+
+
+from datasets.coco import CocoDetection
+
+def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
+    dataset = CocoDetection(
+        root=data_path,
+        annFile=annFile,
+        img_size=img_sz,
+        data_process_type=data_process_type
+    )
+    calibration_dataset = dataset
+    num_samples = min(5000, batch_size * step)
+    if num_samples > 0:
+        calibration_dataset = torch.utils.data.Subset(
+            dataset, indices=range(num_samples)
+        )
+
+    calibration_dataloader = DataLoader(
+        calibration_dataset,
+        shuffle=False,
+        batch_size=batch_size,
+        drop_last=False,
+        num_workers=workers,
+    )
+    return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
index f5381ef3..63b53420 100644
--- a/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov4/ixrt/ci/prepare.sh
@@ -30,10 +30,8 @@ pip3 install -r requirements.txt
 # clone yolov4
 cp -r /root/data/3rd_party/yolov4 ./
 
-mkdir data
+mkdir checkpoints
 # export onnx model
-python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight /root/data/checkpoints/yolov4.weights --batchsize 16 --output data/yolov4.onnx
-mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight /root/data/checkpoints/yolov4.weights --output yolov4.onnx
+mv yolov4.onnx checkpoints/yolov4.onnx
 
-# Use onnxsim optimize onnx model
-onnxsim data/yolov4.onnx data/yolov4_sim.onnx
diff --git a/models/cv/object_detection/yolov4/ixrt/coco_labels.py b/models/cv/object_detection/yolov4/ixrt/coco_labels.py
index 5fc21282..69d38878 100644
--- a/models/cv/object_detection/yolov4/ixrt/coco_labels.py
+++ b/models/cv/object_detection/yolov4/ixrt/coco_labels.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 labels = [
     "person",
     "bicycle",
diff --git a/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG b/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG
new file mode 100644
index 00000000..c0499494
--- /dev/null
+++ b/models/cv/object_detection/yolov4/ixrt/config/YOLOV4_CONFIG
@@ -0,0 +1,49 @@
+# BSZ : 构建engine以及推理时的batchsize
+# IMGSIZE : 模型输入hw大小
+# RUN_MODE : [FPS, MAP]
+# PRECISION : [float16, int8]
+# MODEL_NAME : 生成onnx/engine的basename
+# ORIGINE_MODEL : 原始onnx文件
+# COCO_GT : COCOEVAL标签文件
+# DATASET_DIR : 量化/推理数据集路径
+# CHECKPOINTS_DIR : 存放生成的onnx/engine路径
+# LAYER_FUSION : decoder部分走融合算子实现  0不融合 1融合
+# DECODER_FASTER : 有两种融合实现,faster版本速度快且可以直接对接gpu nms;另一种实现的输出和onnx保持一致.  1:faster
+IMGSIZE=416
+MODEL_NAME=yolov4
+ORIGINE_MODEL=yolov4.onnx
+DATA_PROCESS_TYPE=yolov4
+MODEL_INPUT_NAMES=(input)
+
+LAYER_FUSION=1
+DECODER_FASTER=1
+DECODER_NUM_CLASS=80
+DECODER_INPUT_NAMES=(/models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0)
+DECODER_8_ANCHOR=(12 16 19 36 40 28)
+DECODER_16_ANCHOR=(36 75 76 55 72 146)
+DECODER_32_ANCHOR=(142 110 192 243 459 401)
+
+# NMS CONFIG
+    # IOU_THRESH : iou阈值
+    # SCORE_THRESH : bbox置信度阈值
+    # MAX_BOX_PRE_IMG : 每张图片预测bbox的数量上限
+    # ALL_BOX_NUM : nms接收每张图片的box数量
+    # NMS_TYPE : GPU/CPU(TODO)
+IOU_THRESH=0.6
+SCORE_THRESH=0.001
+MAX_BOX_PRE_IMG=1000
+ALL_BOX_NUM=10647
+NMS_TYPE=GPU
+
+# QUANT CONFIG (仅PRECISION为int8时生效)
+    # QUANT_OBSERVER : 量化策略，可选 [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE : 量化时组dataloader的batchsize, 最好和onnx中的batchsize保持一致，有些op可能推导shape错误(比如Reshape)
+    # QUANT_STEP : 量化步数
+    # QUANT_SEED : 随机种子 保证量化结果可复现
+    # QUANT_EXIST_ONNX : 如果有其他来源的量化模型则填写
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=1
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=()
+QUANT_EXIST_ONNX=
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/cut_model.py b/models/cv/object_detection/yolov4/ixrt/cut_model.py
index cf4f88da..af0a3a4f 100644
--- a/models/cv/object_detection/yolov4/ixrt/cut_model.py
+++ b/models/cv/object_detection/yolov4/ixrt/cut_model.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import onnx
 import argparse
 from onnxsim import simplify
@@ -27,4 +13,4 @@ def parse_args():
 
 args = parse_args()
 onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
-print("  Cut Model Done.")
+print("  Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/deploy.py b/models/cv/object_detection/yolov4/ixrt/deploy.py
index 084356ec..ec56b7ab 100644
--- a/models/cv/object_detection/yolov4/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov4/ixrt/deploy.py
@@ -1,90 +1,8 @@
 # !/usr/bin/env python
 # -*- coding: utf-8 -*-
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import argparse
-import copy
-
-from typing import Union, Callable, List
-
-from tensorrt.deploy.api import *
-from tensorrt.deploy.backend.onnx.converter import default_converter
-from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
-from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
-from tensorrt.deploy.ir.operator_type import OperatorType as OP
-from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
-from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
-from tensorrt.deploy.ir import Graph
-from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
-from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
 from tensorrt.deploy.api import GraphTransform, create_source, create_target
 
-class FuseMishPass(BasePass):
-    def process(self, graph: Graph) -> Graph:
-        pattern = build_sequence_graph([OP.SOFTPLUS, OP.TANH, OP.MUL])
-
-        matcher = GraphMatcher(pattern, strict=False)
-        self.transform = GraphTransform(graph)
-        matcher.findall(graph, self.fuse_mish)
-        return graph
-
-    def fuse_mish(self, graph: Graph, pattern_graph: PatternGraph):
-        softplus = pattern_graph.nodes[0].operator
-        mul = pattern_graph.nodes[-1].operator
-
-        if not self.can_fused(graph, pattern_graph):
-            return
-
-        self.transform.delete_operators_between_op_op(softplus, mul)
-
-        mish_op = Operator(
-            name=generate_operator_name(graph, pattern="Mish_{idx}"),
-            op_type=OP.MISH,
-            inputs=copy.copy(softplus.inputs),
-            outputs=copy.copy(mul.outputs),
-        )
-        mish_op.is_quant_operator = softplus.is_quant_operator and mul.is_quant_operator
-        graph.add_operator(mish_op)
-
-    def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
-        softplus = pattern_graph.nodes[0].operator
-        mul = pattern_graph.nodes[-1].operator
-
-        # 检查 Softplus, tanh 的输出是不是只有一个 OP 使用
-        # 如果有多个 OP 使用，则不能融合
-        for node in pattern_graph.nodes[:2]:
-            next_ops = graph.get_next_operators(node.operator)
-            if len(next_ops) != 1:
-                return False
-
-        # 检查 Mul 的输入是不是和 Softplus 是同源的
-        softplus_prev_op = graph.get_previous_operators(softplus)
-        if len(softplus_prev_op) != 1:
-            return False
-
-        mul_prev_op = graph.get_previous_operators(mul)
-        if len(mul_prev_op) != 2:
-            return False
-
-        for op in mul_prev_op:
-            if op is softplus_prev_op[0]:
-                return True
-
-        return False
-
-
 class Transform:
     def __init__(self, graph):
         self.t = GraphTransform(graph)
@@ -168,24 +86,32 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    else:
+    elif args.with_nms:
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
             axis=1
         )
 
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
+        graph.outputs.clear()
+        graph.add_output("output")
+        graph.outputs["output"].dtype = "FLOAT"
+    else:
+        graph.outputs.clear()
+        graph.add_output("decoder_8")
+        graph.outputs["decoder_8"].dtype = "FLOAT"
+        graph.add_output("decoder_16")
+        graph.outputs["decoder_16"].dtype = "FLOAT"
+        graph.add_output("decoder_32")
+        graph.outputs["decoder_32"].dtype = "FLOAT"
     return graph
 
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
     parser.add_argument("--decoder_input_names", nargs='+', type=str)
     parser.add_argument("--decoder8_anchor", nargs='*', type=int)
     parser.add_argument("--decoder16_anchor", nargs='*', type=int)
@@ -199,12 +125,10 @@ def parse_args():
     args = parser.parse_args()
     return args
 
-
 if __name__ == "__main__":
 
     args = parse_args()
     graph = create_source(args.src)()
     graph = customize_ops(graph, args)
-    graph = FuseMishPass().process(graph)
     create_target(saved_path=args.dst).export(graph)
-    print("Surged onnx lies on", args.dst)
+    print("Surged onnx lies on", args.dst)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/export.py b/models/cv/object_detection/yolov4/ixrt/export.py
index 7c8bbfa5..db7e06cc 100644
--- a/models/cv/object_detection/yolov4/ixrt/export.py
+++ b/models/cv/object_detection/yolov4/ixrt/export.py
@@ -32,11 +32,6 @@ def parse_args():
                     required=True, 
                     help="darknet weights path.")
     
-    parser.add_argument("--batchsize", 
-                    type=int, 
-                    required=True, 
-                    help="Onnx model batchsize.")
-    
     parser.add_argument("--output", 
                     type=str, 
                     required=True, 
@@ -49,7 +44,7 @@ def parse_args():
 def main():
     args = parse_args()
 
-    transform_to_onnx(args.cfg, args.weight, args.batchsize, args.output)
+    transform_to_onnx(args.cfg, args.weight, -1, args.output)
     
 if __name__ == "__main__":
     main()
diff --git a/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py
index 2bb0abc2..932efbdf 100644
--- a/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py
+++ b/models/cv/object_detection/yolov4/ixrt/load_ixrt_plugin.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import ctypes
 import tensorrt
 from os.path import join, dirname, exists
@@ -23,4 +9,4 @@ def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="",
             f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
     ctypes.CDLL(dynamic_path)
     tensorrt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
+    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov4/ixrt/modify_batchsize.py
new file mode 100644
index 00000000..f696ae55
--- /dev/null
+++ b/models/cv/object_detection/yolov4/ixrt/modify_batchsize.py
@@ -0,0 +1,54 @@
+import onnx
+import argparse
+import copy
+import numpy as np
+
+def change_input_dim(model, bsz):
+    batch_size = bsz
+
+    # The following code changes the first dimension of every input to be batch_size
+    # Modify as appropriate ... note that this requires all inputs to
+    # have the same batch_size
+    inputs = model.graph.input
+    for input in inputs:
+        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
+        # Add checks as needed.
+        dim1 = input.type.tensor_type.shape.dim[0]
+        # update dim to be a symbolic value
+        if isinstance(batch_size, str):
+            # set dynamic batch size
+            dim1.dim_param = batch_size
+        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
+            # set given batch size
+            dim1.dim_value = int(batch_size)
+        else:
+            # set batch size of 1
+            dim1.dim_value = 1
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch_size", type=int)
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    args = parser.parse_args()
+    return args
+
+def modify_resize_nodes(model, bsz):
+    print("modify resize")
+    for node in model.graph.node:
+        if node.op_type == "Resize":
+            if len(node.input) >= 4 and node.input[3]:
+                sizes_name = node.input[3]
+                for initializer in model.graph.initializer:
+                    if initializer.name == sizes_name:
+                        shape = copy.deepcopy(onnx.numpy_helper.to_array(initializer))
+                        shape[0] = shape[0] * bsz
+                        new_sizes = np.array(shape, dtype=np.int64)
+                        initializer.CopyFrom(onnx.numpy_helper.from_array(new_sizes, name=initializer.name))
+                        break
+    
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_input_dim(model, args.batch_size)
+modify_resize_nodes(model, args.batch_size)
+onnx.save(model, args.output_model)
diff --git a/models/cv/object_detection/yolov4/ixrt/quant.py b/models/cv/object_detection/yolov4/ixrt/quant.py
index 70265cbc..d73212ca 100644
--- a/models/cv/object_detection/yolov4/ixrt/quant.py
+++ b/models/cv/object_detection/yolov4/ixrt/quant.py
@@ -1,50 +1,34 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 import os
-import cv2
 import random
 import argparse
 import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-from common import letterbox
-
+import sys
+sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
+print(sys.path)
+from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
 
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name", type=str)
-    parser.add_argument("--model", type=str,  default="yolov4_bs16_without_decoder.onnx")
+    parser.add_argument("--model", type=str,  default="yolov5s_with_decoder.onnx")
+    parser.add_argument("--data_process_type", type=str,  default="none")
     parser.add_argument("--dataset_dir", type=str,  default="./coco2017/val2017")
     parser.add_argument("--ann_file", type=str,  default="./coco2017/annotations/instances_val2017.json")
     parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
     parser.add_argument("--disable_quant_names", nargs='*', type=str)
-    parser.add_argument("--save_quant_model", type=str,  help="save the quantization model path", default=None)
-    parser.add_argument("--bsz", type=int, default=16)
-    parser.add_argument("--step", type=int, default=32)
+    parser.add_argument("--save_dir", type=str,  help="save path", default=None)
+    parser.add_argument("--bsz", type=int, default=32)
+    parser.add_argument("--step", type=int, default=20)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--imgsz", type=int, default=608)
-    parser.add_argument("--use_letterbox", action="store_true")
+    parser.add_argument("--imgsz", type=int, default=640)
     args = parser.parse_args()
     return args
 
@@ -52,54 +36,20 @@ args = parse_args()
 setseed(args.seed)
 model_name = args.model_name
 
-
-def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
-    num = step * batch_size
-    val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
-    random.shuffle(val_list)
-    pic_list = val_list[:num]
-
-    calibration_dataset = []
-    for file_path in pic_list:
-        pic_data = cv2.imread(file_path)
-        org_img = pic_data
-        assert org_img is not None, 'Image not Found ' + file_path
-        h0, w0 = org_img.shape[:2]
-
-        if use_letterbox:
-            img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
-        else:
-            img = cv2.resize(org_img, new_shape)
-        img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-        img = np.ascontiguousarray(img) / 255.0  # 0~1 np array
-        img = torch.from_numpy(img).float()
-
-        calibration_dataset.append(img)
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=True,
-        batch_size=batch_size,
-        drop_last=True
-    )
-    return calibration_dataloader
-
-dataloader = get_dataloader(
-    data_dir=args.dataset_dir,
-    step=args.step,
+out_dir = args.save_dir
+dataloader = create_dataloaders(
+    data_path=args.dataset_dir,
+    annFile=args.ann_file,
+    img_sz=args.imgsz,
     batch_size=args.bsz,
-    new_shape=(args.imgsz, args.imgsz),
-    use_letterbox=args.use_letterbox
+    step=args.step,
+    data_process_type=args.data_process_type
 )
-
-dirname = os.path.dirname(args.save_quant_model)
-quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
-
+# print("disable_quant_names : ", args.disable_quant_names)
 static_quantize(args.model,
         calibration_dataloader=dataloader,
-        save_quant_onnx_path=args.save_quant_model,
-        save_quant_params_path=quant_json_path,
+        save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
         observer=args.observer,
-        data_preprocess=lambda x: x.to("cuda"),
+        data_preprocess=lambda x: x[0].to("cuda"),
         quant_format="qdq",
-        disable_quant_names=args.disable_quant_names)
+        disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
index c33dc591..c86762e0 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_accuracy.sh
@@ -1,92 +1,185 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run paraments
+BSZ=32
+WARM_UP=-1
+TGT=0.65
+LOOP_COUNT=-1
+RUN_MODE=MAP
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model skip, ${SIM_MODEL} has been existed
 else
-    python3 ${RUN_DIR}/cut_model.py  \
-        --input_model ${CURRENT_MODEL}     \
-        --output_model ${FINAL_MODEL}      \
-        --input_names input                \
-        --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py    \
+    --origin_model ${CURRENT_MODEL}         \
+    --output_model ${SIM_MODEL}
+    echo "  "Generate ${SIM_MODEL}
+fi
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo "  "Cut Decoder skip, ${SIM_MNO_DECODER_MODELODEL} has been existed
+else
+    python3 ${RUN_DIR}/cut_model.py         \
+    --input_model  ${CURRENT_MODEL}         \
+    --output_model ${NO_DECODER_MODEL}      \
+    --input_names ${MODEL_INPUT_NAMES[@]}   \
+    --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed
+    else
+        python3 ${RUN_DIR}/quant.py                         \
+            --model ${CURRENT_MODEL}                        \
+            --model_name ${MODEL_NAME}                      \
+            --dataset_dir ${EVAL_DIR}                       \
+            --ann_file ${COCO_GT}                           \
+            --data_process_type ${DATA_PROCESS_TYPE}        \
+            --observer ${QUANT_OBSERVER}                    \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
+            --save_dir $CHECKPOINTS_DIR                     \
+            --bsz   ${QUANT_BATCHSIZE}                      \
+            --step  ${QUANT_STEP}                           \
+            --seed  ${QUANT_SEED}                           \
+            --imgsz ${IMGSIZE}
+        echo "  "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
+    else
+        python3 ${RUN_DIR}/deploy.py                        \
+            --src ${CURRENT_MODEL}                          \
+            --dst ${FUSION_ONNX}                            \
+            --decoder_type        YoloV3Decoder             \
+            --with_nms             True                   \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
+            --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
+            --decoder32_anchor    ${DECODER_32_ANCHOR[@]}   \
+            --num_class           ${DECODER_NUM_CLASS}      \
+            --faster              ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo "  "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
-    python3 ${RUN_DIR}/deploy.py             \
-        --src ${CURRENT_MODEL}               \
-        --dst ${FINAL_MODEL}                 \
-        --decoder_type YoloV3Decoder          \
-        --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-        --decoder8_anchor 12 16 19 36 40 28            \
-        --decoder16_anchor 36 75 76 55 72 146          \
-        --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
     echo "  "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --precision float16                        \
+        --precision ${PRECISION}                \
         --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py      \
+        --bsz ${BSZ}                            \
+        --path ${CHECKPOINTS_DIR}               \
+        --all_box_num ${ALL_BOX_NUM}            \
+        --max_box_pre_img   ${MAX_BOX_PRE_IMG}  \
+        --iou_thresh        ${IOU_THRESH}       \
+        --score_thresh      ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py                 \
     --test_mode MAP                             \
@@ -100,4 +193,4 @@ python3 ${RUN_DIR}/inference.py                 \
     --pred_dir ${CHECKPOINTS_DIR}               \
     --precision float16                            \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh
index a4a83ce7..dabe655b 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_fp16_performance.sh
@@ -1,92 +1,186 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run paraments
+BSZ=32
+WARM_UP=3
+TGT=1010
+LOOP_COUNT=100
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model skip, ${SIM_MODEL} has been existed
 else
-    python3 ${RUN_DIR}/cut_model.py  \
-        --input_model ${CURRENT_MODEL}     \
-        --output_model ${FINAL_MODEL}      \
-        --input_names input                \
-        --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py    \
+    --origin_model ${CURRENT_MODEL}         \
+    --output_model ${SIM_MODEL}
+    echo "  "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}
+
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo "  "Cut Decoder skip, ${SIM_MNO_DECODER_MODELODEL} has been existed
+else
+    python3 ${RUN_DIR}/cut_model.py         \
+    --input_model  ${CURRENT_MODEL}         \
+    --output_model ${NO_DECODER_MODEL}      \
+    --input_names ${MODEL_INPUT_NAMES[@]}   \
+    --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed
+    else
+        python3 ${RUN_DIR}/quant.py                         \
+            --model ${CURRENT_MODEL}                        \
+            --model_name ${MODEL_NAME}                      \
+            --dataset_dir ${EVAL_DIR}                       \
+            --ann_file ${COCO_GT}                           \
+            --data_process_type ${DATA_PROCESS_TYPE}        \
+            --observer ${QUANT_OBSERVER}                    \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
+            --save_dir $CHECKPOINTS_DIR                     \
+            --bsz   ${QUANT_BATCHSIZE}                      \
+            --step  ${QUANT_STEP}                           \
+            --seed  ${QUANT_SEED}                           \
+            --imgsz ${IMGSIZE}
+        echo "  "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
+    else
+        python3 ${RUN_DIR}/deploy.py                        \
+            --src ${CURRENT_MODEL}                          \
+            --dst ${FUSION_ONNX}                            \
+            --decoder_type        YoloV3Decoder             \
+            --with_nms             False                   \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
+            --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
+            --decoder32_anchor    ${DECODER_32_ANCHOR[@]}   \
+            --num_class           ${DECODER_NUM_CLASS}      \
+            --faster              ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo "  "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
-    python3 ${RUN_DIR}/deploy.py             \
-        --src ${CURRENT_MODEL}               \
-        --dst ${FINAL_MODEL}                 \
-        --decoder_type YoloV3Decoder          \
-        --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-        --decoder8_anchor 12 16 19 36 40 28            \
-        --decoder16_anchor 36 75 76 55 72 146          \
-        --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
     echo "  "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --precision float16                        \
+        --precision ${PRECISION}                \
         --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py      \
+        --bsz ${BSZ}                            \
+        --path ${CHECKPOINTS_DIR}               \
+        --all_box_num ${ALL_BOX_NUM}            \
+        --max_box_pre_img   ${MAX_BOX_PRE_IMG}  \
+        --iou_thresh        ${IOU_THRESH}       \
+        --score_thresh      ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py                 \
     --test_mode FPS                             \
@@ -100,4 +194,4 @@ python3 ${RUN_DIR}/inference.py                 \
     --pred_dir ${CHECKPOINTS_DIR}               \
     --precision float16                            \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
index 20e59378..646b115f 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_accuracy.sh
@@ -1,110 +1,185 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run paraments
+BSZ=32
+WARM_UP=-1
+TGT=0.65
+LOOP_COUNT=-1
+RUN_MODE=MAP
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model skip, ${SIM_MODEL} has been existed
 else
-    python3 ${RUN_DIR}/cut_model.py  \
-        --input_model ${CURRENT_MODEL}     \
-        --output_model ${FINAL_MODEL}      \
-        --input_names input                \
-        --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py    \
+    --origin_model ${CURRENT_MODEL}         \
+    --output_model ${SIM_MODEL}
+    echo "  "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}
 
-# quant
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo "  "Cut Decoder skip, ${SIM_MNO_DECODER_MODELODEL} has been existed
 else
-    python3 ${RUN_DIR}/quant.py             \
-        --model_name "YOLOV4_DARKNET"       \
-        --model ${CURRENT_MODEL}            \
-        --bsz ${BATCH_SIZE}                 \
-        --dataset_dir ${EVAL_DIR}           \
-        --ann_file ${COCO_GT}               \
-        --observer "hist_percentile"        \
-        --save_quant_model ${FINAL_MODEL}   \
-        --imgsz 608
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/cut_model.py         \
+    --input_model  ${CURRENT_MODEL}         \
+    --output_model ${NO_DECODER_MODEL}      \
+    --input_names ${MODEL_INPUT_NAMES[@]}   \
+    --output_names ${DECODER_INPUT_NAMES[@]}
+fi
+CURRENT_MODEL=${NO_DECODER_MODEL}
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed
+    else
+        python3 ${RUN_DIR}/quant.py                         \
+            --model ${CURRENT_MODEL}                        \
+            --model_name ${MODEL_NAME}                      \
+            --dataset_dir ${EVAL_DIR}                       \
+            --ann_file ${COCO_GT}                           \
+            --data_process_type ${DATA_PROCESS_TYPE}        \
+            --observer ${QUANT_OBSERVER}                    \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
+            --save_dir $CHECKPOINTS_DIR                     \
+            --bsz   ${QUANT_BATCHSIZE}                      \
+            --step  ${QUANT_STEP}                           \
+            --seed  ${QUANT_SEED}                           \
+            --imgsz ${IMGSIZE}
+        echo "  "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
+    else
+        python3 ${RUN_DIR}/deploy.py                        \
+            --src ${CURRENT_MODEL}                          \
+            --dst ${FUSION_ONNX}                            \
+            --decoder_type        YoloV3Decoder             \
+            --with_nms             True                   \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
+            --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
+            --decoder32_anchor    ${DECODER_32_ANCHOR[@]}   \
+            --num_class           ${DECODER_NUM_CLASS}      \
+            --faster              ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo "  "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
-    python3 ${RUN_DIR}/deploy.py             \
-        --src ${CURRENT_MODEL}               \
-        --dst ${FINAL_MODEL}                 \
-        --decoder_type YoloV3Decoder          \
-        --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-        --decoder8_anchor 12 16 19 36 40 28            \
-        --decoder16_anchor 36 75 76 55 72 146          \
-        --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
     echo "  "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --precision int8                        \
+        --precision ${PRECISION}                \
         --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py      \
+        --bsz ${BSZ}                            \
+        --path ${CHECKPOINTS_DIR}               \
+        --all_box_num ${ALL_BOX_NUM}            \
+        --max_box_pre_img   ${MAX_BOX_PRE_IMG}  \
+        --iou_thresh        ${IOU_THRESH}       \
+        --score_thresh      ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py                 \
     --test_mode MAP                             \
@@ -116,6 +191,6 @@ python3 ${RUN_DIR}/inference.py                 \
     --eval_dir ${EVAL_DIR}                      \
     --coco_gt ${COCO_GT}                        \
     --pred_dir ${CHECKPOINTS_DIR}               \
-    --precision int8                            \
+    --precision float16                            \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
index 7f110386..4665a65f 100644
--- a/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
+++ b/models/cv/object_detection/yolov4/ixrt/scripts/infer_yolov4_int8_performance.sh
@@ -1,110 +1,186 @@
 #!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 
 EXIT_STATUS=0
 check_status()
 {
-    if ((${PIPESTATUS[0]} != 0));then
-    EXIT_STATUS=1
+    ret_code=${PIPESTATUS[0]}
+    if [ ${ret_code} != 0 ]; then
+    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
     fi
 }
 
-PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
-DATASETS_DIR=${DATASETS_DIR:-"${PROJ_DIR}/data/coco"}
-COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
-EVAL_DIR=${DATASETS_DIR}/images/val2017
-CHECKPOINTS_DIR="${PROJ_DIR}/data"
-RUN_DIR="${PROJ_DIR}"
-ORIGINE_MODEL=${CHECKPOINTS_DIR}
+# Run paraments
+BSZ=32
+WARM_UP=3
+TGT=1010
+LOOP_COUNT=100
+RUN_MODE=FPS
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+    esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
 
 echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
 echo DATASETS_DIR : ${DATASETS_DIR}
 echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
 echo ====================== Model Info ======================
-echo Model Name : yolov4_darknet
+echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-BATCH_SIZE=16
-CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
+mkdir -p ${CHECKPOINTS_DIR}
 
-# Cut decoder part
-echo "Cut decoder part"
-FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "CUT Model Skip, $FINAL_MODEL has been existed
+step=0
+faster=0
+CURRENT_MODEL=${ORIGINE_MODEL}
+if [[ ${LAYER_FUSION} == 1 && ${DECODER_FASTER} == 1 ]];then
+    faster=1
+fi
+
+# Simplify Model
+let step++
+echo [STEP ${step}] : Simplify Model
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model skip, ${SIM_MODEL} has been existed
 else
-    python3 ${RUN_DIR}/cut_model.py  \
-        --input_model ${CURRENT_MODEL}     \
-        --output_model ${FINAL_MODEL}      \
-        --input_names input                \
-        --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/simplify_model.py    \
+    --origin_model ${CURRENT_MODEL}         \
+    --output_model ${SIM_MODEL}
+    echo "  "Generate ${SIM_MODEL}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${SIM_MODEL}
 
-# quant
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
-if [ -f $FINAL_MODEL ];then
-    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
+# Cut Decoder
+let step++
+echo [STEP ${step}] : Cut Decoder
+NO_DECODER_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_without_decoder.onnx
+if [ -f ${NO_DECODER_MODEL} ];then
+    echo "  "Cut Decoder skip, ${SIM_MNO_DECODER_MODELODEL} has been existed
 else
-    python3 ${RUN_DIR}/quant.py             \
-        --model_name "YOLOV4_DARKNET"       \
-        --model ${CURRENT_MODEL}            \
-        --bsz ${BATCH_SIZE}                 \
-        --dataset_dir ${EVAL_DIR}           \
-        --ann_file ${COCO_GT}               \
-        --observer "hist_percentile"        \
-        --save_quant_model ${FINAL_MODEL}   \
-        --imgsz 608
-    echo "  "Generate ${FINAL_MODEL}
+    python3 ${RUN_DIR}/cut_model.py         \
+    --input_model  ${CURRENT_MODEL}         \
+    --output_model ${NO_DECODER_MODEL}      \
+    --input_names ${MODEL_INPUT_NAMES[@]}   \
+    --output_names ${DECODER_INPUT_NAMES[@]}
 fi
-CURRENT_MODEL=${FINAL_MODEL}
+CURRENT_MODEL=${NO_DECODER_MODEL}
 
-# add decoder op
-FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+    let step++
+    echo;
+    echo [STEP ${step}] : Quant Model
+    if [[ -z ${QUANT_EXIST_ONNX} ]];then
+        QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+    fi
+    if [[ -f ${QUANT_EXIST_ONNX} ]];then
+        CURRENT_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} has been existed
+    else
+        python3 ${RUN_DIR}/quant.py                         \
+            --model ${CURRENT_MODEL}                        \
+            --model_name ${MODEL_NAME}                      \
+            --dataset_dir ${EVAL_DIR}                       \
+            --ann_file ${COCO_GT}                           \
+            --data_process_type ${DATA_PROCESS_TYPE}        \
+            --observer ${QUANT_OBSERVER}                    \
+            --disable_quant_names ${DISABLE_QUANT_LIST[@]}  \
+            --save_dir $CHECKPOINTS_DIR                     \
+            --bsz   ${QUANT_BATCHSIZE}                      \
+            --step  ${QUANT_STEP}                           \
+            --seed  ${QUANT_SEED}                           \
+            --imgsz ${IMGSIZE}
+        echo "  "Generate ${QUANT_EXIST_ONNX}
+    fi
+    CURRENT_MODEL=${QUANT_EXIST_ONNX}
+fi
+
+# Add Decoder
+if [ $LAYER_FUSION == "1" ]; then
+    let step++
+    echo;
+    echo [STEP ${step}] : Add Decoder
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    if [ -f $FUSION_ONNX ];then
+        echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
+    else
+        python3 ${RUN_DIR}/deploy.py                        \
+            --src ${CURRENT_MODEL}                          \
+            --dst ${FUSION_ONNX}                            \
+            --decoder_type        YoloV3Decoder             \
+            --with_nms             False                   \
+            --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
+            --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
+            --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
+            --decoder32_anchor    ${DECODER_32_ANCHOR[@]}   \
+            --num_class           ${DECODER_NUM_CLASS}      \
+            --faster              ${faster}
+    fi
+    CURRENT_MODEL=${FUSION_ONNX}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
 if [ -f $FINAL_MODEL ];then
-    echo "  "Add Decoder Skip, $FINAL_MODEL has been existed
+    echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
-    python3 ${RUN_DIR}/deploy.py             \
-        --src ${CURRENT_MODEL}               \
-        --dst ${FINAL_MODEL}                 \
-        --decoder_type YoloV3Decoder          \
-        --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
-        --decoder8_anchor 12 16 19 36 40 28            \
-        --decoder16_anchor 36 75 76 55 72 146          \
-        --decoder32_anchor 142 110 192 243 459 401
+    python3 ${RUN_DIR}/modify_batchsize.py  \
+        --batch_size ${BSZ}                 \
+        --origin_model ${CURRENT_MODEL}     \
+        --output_model ${FINAL_MODEL}
     echo "  "Generate ${FINAL_MODEL}
 fi
 CURRENT_MODEL=${FINAL_MODEL}
 
 # Build Engine
-echo Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
     python3 ${RUN_DIR}/build_engine.py          \
-        --precision int8                        \
+        --precision ${PRECISION}                \
         --model ${CURRENT_MODEL}                \
         --engine ${ENGINE_FILE}
     echo "  "Generate Engine ${ENGINE_FILE}
 fi
+if [[ ${RUN_MODE} == "MAP" && ${NMS_TYPE} == "GPU" ]];then
+    NMS_ENGINE=${CHECKPOINTS_DIR}/nms.engine
+    # Build NMS Engine
+    python3 ${RUN_DIR}/build_nms_engine.py      \
+        --bsz ${BSZ}                            \
+        --path ${CHECKPOINTS_DIR}               \
+        --all_box_num ${ALL_BOX_NUM}            \
+        --max_box_pre_img   ${MAX_BOX_PRE_IMG}  \
+        --iou_thresh        ${IOU_THRESH}       \
+        --score_thresh      ${SCORE_THRESH}
+fi
 
 # Inference
-echo Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
 RUN_BATCH_SIZE=16
 python3 ${RUN_DIR}/inference.py                 \
     --test_mode FPS                             \
@@ -116,6 +192,6 @@ python3 ${RUN_DIR}/inference.py                 \
     --eval_dir ${EVAL_DIR}                      \
     --coco_gt ${COCO_GT}                        \
     --pred_dir ${CHECKPOINTS_DIR}               \
-    --precision int8                            \
+    --precision float16                            \
     --map_target 0.30; check_status
-exit ${EXIT_STATUS}
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov4/ixrt/simplify_model.py b/models/cv/object_detection/yolov4/ixrt/simplify_model.py
new file mode 100644
index 00000000..b4254b6f
--- /dev/null
+++ b/models/cv/object_detection/yolov4/ixrt/simplify_model.py
@@ -0,0 +1,21 @@
+import onnx
+import argparse
+from onnxsim import simplify
+
+# Simplify
+def simplify_model(args):
+    onnx_model = onnx.load(args.origin_model)
+    model_simp, check = simplify(onnx_model)
+    model_simp = onnx.shape_inference.infer_shapes(model_simp)
+    onnx.save(model_simp, args.output_model)
+    print("  Simplify onnx Done.")
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--origin_model", type=str)
+    parser.add_argument("--output_model", type=str)
+    args = parser.parse_args()
+    return args
+
+args = parse_args()
+simplify_model(args)
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 054ae60e..660bdfcd 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -1519,7 +1519,7 @@
             "github_branch": "",
             "github_path": "",
             "datasets": "",
-            "download_url": "",
+            "download_url": "http://data.lip6.fr/cadene/pretrainedmodels/inceptionresnetv2-520b38e4.pth",
             "need_third_part": "",
             "precisions": [
                 "fp16",
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 39e112ba..0508d8d8 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -264,6 +264,8 @@ def run_detec_testcase(model):
 
     for prec in model["precisions"]:
         logging.info(f"Start running {model_name} {prec} test case")
+        result["result"].setdefault(prec, {})
+        result["result"].setdefault(prec, {"status": "FAIL"})
         script = f"""
         cd ../{model['model_path']}
         export DATASETS_DIR=./{dataset_n}/
@@ -294,7 +296,6 @@ def run_detec_testcase(model):
         combined_pattern = re.compile(f"{fps_pattern}|{e2e_pattern}")
         matchs = combined_pattern.finditer(sout)
         for match in matchs:
-            result["result"].setdefault(prec, {"status": "FAIL"})
             for name, value in match.groupdict().items():
                 if value:
                     try:
@@ -306,7 +307,6 @@ def run_detec_testcase(model):
         pattern = r"Average Precision  \(AP\) @\[ (IoU=0.50[:\d.]*)\s*\| area=   all \| maxDets=\s?\d+\s?\] =\s*([\d.]+)"
         matchs = re.findall(pattern, sout)
         for m in matchs:
-            result["result"].setdefault(prec, {})
             try:
                 result["result"][prec][m[0]] = float(m[1])
             except ValueError:
@@ -318,7 +318,6 @@ def run_detec_testcase(model):
             pattern = METRIC_PATTERN
             matchs = re.findall(pattern, sout)
             if matchs and len(matchs) == 1:
-                result["result"].setdefault(prec, {})
                 result["result"][prec].update(get_metric_result(matchs[0]))
                 result["result"][prec]["status"] = "PASS"
         result["result"][prec]["Cost time (s)"] = t
-- 
Gitee


From cbbc0e1f17dd3b1405dc4e1759c88e505dbcd0fe Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Wed, 2 Jul 2025 10:39:46 +0800
Subject: [PATCH 11/15] sync bert large squad

---
 .../plm/bert_large_squad/ixrt/CMakeLists.txt  |  49 --
 .../nlp/plm/bert_large_squad/ixrt/README.md   |  36 +-
 .../ixrt/{python => }/builder.py              | 541 +++++++++++++-----
 .../ixrt/{python => }/builder_int8.py         |  57 +-
 .../ixrt/{python => }/builder_utils.py        |  86 +--
 .../ixrt/{python => }/builder_utils_int8.py   |  31 -
 .../plm/bert_large_squad/ixrt/ci/prepare.sh   |  12 +-
 .../ixrt/cmake/FindCompiler.cmake             |  15 -
 .../ixrt/cmake/FindCuda.cmake                 |  57 --
 .../ixrt/cmake/FindIxrt.cmake                 |  19 -
 .../ixrt/cmake/FindPluginFiles.cmake          |   7 -
 .../ixrt/{python => }/evaluate-v1.1.py        |  18 -
 .../ixrt/{python => }/helpers/__init__.py     |   0
 .../ixrt/{python => }/helpers/calibrator.py   |  13 +-
 .../{python => }/helpers/data_processing.py   |  16 +-
 .../ixrt/{python => }/helpers/tokenization.py |   0
 .../ixrt/{python => }/inference.py            | 123 ++--
 .../bert_large_squad/ixrt/load_ixrt_plugin.py |  13 +
 .../ixrt/{python => }/perf.py                 |  34 +-
 .../ixrt/python/load_ixrt_plugin.py           |  40 --
 .../ixrt/python/script/build_engine.sh        |  34 --
 .../ixrt/python/script/inference.sh           |  36 --
 .../ixrt/python/script/inference_squad.sh     |  36 --
 .../ixrt/python/script/mdb_infer_run.sh       |  63 --
 .../ixrt/python/script/perf.sh                |  23 -
 .../infer_bert_large_squad_fp16_accuracy.sh   |  50 ++
 ...infer_bert_large_squad_fp16_performance.sh |  48 ++
 .../infer_bert_large_squad_int8_accuracy.sh   |  49 ++
 ...infer_bert_large_squad_int8_performance.sh |  47 ++
 .../{python/script => scripts}/prepare.sh     |   0
 .../ixrt/src/api/plugin_loader.cc             | 168 ------
 .../ixrt/src/backend/bert/bert_helper.h       | 299 ----------
 .../ixrt/src/backend/cublas/cublas_helper.h   | 312 ----------
 .../backend/ixinfer/ixinfer_gemm_helper.cu    | 416 --------------
 .../src/backend/ixinfer/ixinfer_gemm_helper.h |  73 ---
 .../ixrt/src/common/bertCommon.h              | 242 --------
 .../ixrt/src/common/checkMacrosPlugin.cpp     |  62 --
 .../ixrt/src/common/checkMacrosPlugin.h       | 221 -------
 .../ixrt/src/common/common_def.cuh            |  67 ---
 .../ixrt/src/common/plugin.cpp                |  63 --
 .../bert_large_squad/ixrt/src/common/plugin.h |  72 ---
 .../ixrt/src/common/serialize.h               | 148 -----
 .../ixrt/src/custom_fc/fcInt8Plugin.cpp       | 431 --------------
 .../ixrt/src/custom_fc/fcInt8Plugin.cu        | 485 ----------------
 .../ixrt/src/custom_fc/fcPlugin.cpp           | 345 -----------
 .../ixrt/src/custom_fc/fcPlugin.h             | 246 --------
 .../emb_layernorm/embLayerNormInt8Plugin.cpp  | 503 ----------------
 .../emb_layernorm/embLayerNormInt8Plugin.cu   | 342 -----------
 .../emb_layernorm/embLayerNormInt8Plugin.h    | 128 -----
 .../src/emb_layernorm/embLayerNormPlugin.cpp  | 495 ----------------
 .../src/emb_layernorm/embLayerNormPlugin.cu   | 258 ---------
 .../src/emb_layernorm/embLayerNormPlugin.h    | 142 -----
 .../ixrt/src/ffn/ffnPlugin.cpp                | 389 -------------
 .../bert_large_squad/ixrt/src/ffn/ffnPlugin.h | 216 -------
 .../ixrt/src/gelu/geluPlugin.cpp              | 355 ------------
 .../ixrt/src/gelu/geluPlugin.cu               | 218 -------
 .../ixrt/src/gelu/geluPlugin.h                | 148 -----
 .../qkv_to_context/qkvToContextInt8Plugin.cpp | 335 -----------
 .../qkv_to_context/qkvToContextInt8Plugin.cu  | 488 ----------------
 .../qkv_to_context/qkvToContextInt8Plugin.h   | 164 ------
 .../src/qkv_to_context/qkvToContextPlugin.cpp | 388 -------------
 .../src/qkv_to_context/qkvToContextPlugin.cu  | 317 ----------
 .../src/qkv_to_context/qkvToContextPlugin.h   | 155 -----
 .../skipLayerNormInt8Plugin.cpp               | 404 -------------
 .../skip_layernorm/skipLayerNormInt8Plugin.cu | 361 ------------
 .../skip_layernorm/skipLayerNormInt8Plugin.h  | 146 -----
 .../skip_layernorm/skipLayerNormPlugin.cpp    | 430 --------------
 .../src/skip_layernorm/skipLayerNormPlugin.cu | 401 -------------
 .../src/skip_layernorm/skipLayerNormPlugin.h  | 133 -----
 tests/model_info.json                         |   3 +-
 tests/run_ixrt.py                             |  12 +-
 71 files changed, 781 insertions(+), 11353 deletions(-)
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/builder.py (38%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/builder_int8.py (89%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/builder_utils.py (74%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/builder_utils_int8.py (85%)
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/evaluate-v1.1.py (83%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/helpers/__init__.py (100%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/helpers/calibrator.py (89%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/helpers/data_processing.py (98%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/helpers/tokenization.py (100%)
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/inference.py (79%)
 create mode 100644 models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py
 rename models/nlp/plm/bert_large_squad/ixrt/{python => }/perf.py (81%)
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh
 create mode 100644 models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh
 create mode 100644 models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh
 create mode 100644 models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh
 create mode 100644 models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh
 rename models/nlp/plm/bert_large_squad/ixrt/{python/script => scripts}/prepare.sh (100%)
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu
 delete mode 100644 models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h

diff --git a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt b/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt
deleted file mode 100644
index 9a0e7a12..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-cmake_minimum_required(VERSION 3.10)
-
-project(nv_plugin)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake;${CMAKE_MODULE_PATH}")
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-set(TARGET_NAME ixrt_plugin)
-set(SHARED_TARGET ${TARGET_NAME})
-set(STATIC_TARGET ${TARGET_NAME}_static)
-set(PLUGIN_REPO_PATH ${PROJECT_SOURCE_DIR})
-
-if(DEFINED USE_TENSORRT)
-  find_package(CUDA)
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_75)
-
-  include_directories(
-        ${CUDA_PATH}/include)
-
-  message(STATUS "Plugin lib use TRT 8.6.1")
-  set(TRT_INC_PATH /usr/include/x86_64-linux-gnu/)
-  set(TRT_LIB_PATH /usr/lib/x86_64-linux-gnu/ /usr/local/cuda/targets/x86_64-linux/lib)
-  set(TRT_LIBRARY nvinfer cublasLt)
-  
-  message(STATUS "cuda_libs = ${CUDA_LIBRARIES}")
-  message(STATUS "cudadevrt_libs = ${CUDA_cudadevrt_LIBRARY}")
-else()
-  include(FindIxrt)
-  include(FindCompiler)
-  include(FindCuda)
-  set(TRT_LIBRARY cublasLt cudart ixrt)
-  include_directories(${IXRT_INCLUDE_DIR}
-        ${CUDA_PATH}/include)
-  add_definitions(-D__ILUVATAR__)
-
-  string(APPEND CMAKE_CXX_FLAGS " -std=c++17")
-endif()
-
-include(FindPluginFiles)
-
-################################## Compile Options ######################################
-cuda_add_library(${SHARED_TARGET} SHARED
-        ${PLUGIN_FILES}
-)
-
-target_link_libraries(${SHARED_TARGET} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ${TRT_LIBRARY})
-target_link_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/lib64 ${TRT_LIB_PATH} ${IXRT_LIB_DIR})
-target_include_directories(${SHARED_TARGET}  PUBLIC ${CUDA_PATH}/include ${TRT_INC_PATH} src PUBLIC src/common)
diff --git a/models/nlp/plm/bert_large_squad/ixrt/README.md b/models/nlp/plm/bert_large_squad/ixrt/README.md
index 13e74114..ce525584 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/README.md
+++ b/models/nlp/plm/bert_large_squad/ixrt/README.md
@@ -18,8 +18,7 @@ Get `bert-large-uncased.zip` from [Google
 Drive](https://drive.google.com/file/d/1eD8QBkbK6YN-_YXODp3tmpp3cZKlrPTA/view?usp=drive_link)
 
 ```bash
-cd python/
-bash script/prepare.sh v1_1
+bash scripts/prepare.sh v1_1
 ```
 
 ### Install Dependencies
@@ -27,27 +26,7 @@ bash script/prepare.sh v1_1
 #### Install on Iluvatar
 
 ```bash
-cmake -S . -B build
-cmake --build build -j16
-```
-
-#### Install on NV
-
-Require tensorrt_version >= 8.6
-
-```bash
-# Get TensorRT docker image
-docker pull nvcr.io/nvidia/tensorrt:23.04-py3
-# Run TensorRT docker
-```
-
-```bash
-# Install requirements.txt in TensorRT docker
 pip3 install -r requirements.txt
-
-# Build
-cmake -S . -B build -DUSE_TENSORRT=true
-cmake --build build -j16
 ```
 
 ## Model Inference
@@ -55,20 +34,15 @@ cmake --build build -j16
 ### FP16
 
 ```bash
-cd python/
-
-# use --bs to set max_batch_size (dynamic)
-bash script/build_engine.sh --bs 32
-bash script/inference_squad.sh --bs 32
+bash scripts/infer_bert_large_squad_fp16_accuracy.sh
+bash scripts/infer_bert_large_squad_fp16_performance.sh
 ```
 
 ### INT8
 
 ```bash
-cd python
-pip install onnx pycuda
-bash script/build_engine.sh --bs 32 --int8
-bash script/inference_squad.sh --bs 32 --int8
+bash scripts/infer_bert_large_squad_int8_accuracy.sh
+bash scripts/infer_bert_large_squad_int8_performance.sh
 ```
 
 | Model              | BatchSize   | Precision   | Latency QPS           | exact_match   | f1      |
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py b/models/nlp/plm/bert_large_squad/ixrt/builder.py
similarity index 38%
rename from models/nlp/plm/bert_large_squad/ixrt/python/builder.py
rename to models/nlp/plm/bert_large_squad/ixrt/builder.py
index 627027a0..970f91bc 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/builder.py
@@ -1,3 +1,19 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+#
+
 #!/usr/bin/env python3
 # Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 # All Rights Reserved.
@@ -30,38 +46,64 @@
 # limitations under the License.
 #
 
-import os
 import argparse
-import json
-import tensorrt as trt
-import time
-import sys
 import ctypes
+import json
 import os
-import numpy as np
-from builder_utils import load_onnx_weights_and_quant, load_pytorch_weights_and_quant
-from builder_utils import WQKV, BQKV  # Attention Keys
-from builder_utils import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT  # Transformer Keys
-from builder_utils import SQD_W, SQD_B  # SQuAD Output Keys
+import sys
+import time
 
-trt_version = [int(n) for n in trt.__version__.split('.')]
-plugin_lib_name = "libnvinfer_plugin.so" if os.getenv('USE_TRT') == 'True' else "libixrt_plugin.so"
+import numpy as np
+import ixrt
+from builder_utils import (  # Attention Keys; Transformer Keys; SQuAD Output Keys
+    B_AOUT,
+    B_LOUT,
+    B_MID,
+    BQKV,
+    SQD_B,
+    SQD_W,
+    W_AOUT,
+    W_LOUT,
+    W_MID,
+    WQKV,
+    load_onnx_weights_and_quant,
+    load_pytorch_weights_and_quant,
+)
+
+plugin_lib_name = (
+    "libnvinfer_plugin.so" if os.getenv("USE_TRT") == "True" else "libixrt_plugin.so"
+)
 print(plugin_lib_name)
 
-TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
-from load_ixrt_plugin import load_ixrt_plugin, is_nvidia_platform
+TRT_LOGGER = ixrt.Logger(ixrt.Logger.WARNING)
+from load_ixrt_plugin import load_ixrt_plugin
+
 load_ixrt_plugin(TRT_LOGGER)
 
-plg_registry = trt.get_plugin_registry()
+plg_registry = ixrt.get_plugin_registry()
 registry_list = plg_registry.plugin_creator_list
-print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list])
-emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "1", "")
-qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "1", "")
-skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "1", "")
-ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "")
-gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "")
+print(
+    "registry_list: ",
+    [registry.name + "/" + registry.plugin_version for registry in registry_list],
+)
+emln_plg_creator = plg_registry.get_plugin_creator(
+    "CustomEmbLayerNormPluginDynamic_IxRT", "1", ""
+)
+qkv2_plg_creator = plg_registry.get_plugin_creator(
+    "CustomQKVToContextPluginDynamic_IxRT", "1", ""
+)
+skln_plg_creator = plg_registry.get_plugin_creator(
+    "CustomSkipLayerNormPluginDynamic_IxRT", "1", ""
+)
+ffn_plg_creator = plg_registry.get_plugin_creator(
+    "CustomFFNPluginDynamic_IxRT", "1", ""
+)
+gelu_plg_creator = plg_registry.get_plugin_creator(
+    "CustomGeluPluginDynamic_IxRT", "1", ""
+)
 fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "")
 
+
 class BertConfig:
     def __init__(self, bert_config_path, use_fp16, use_trt):
         with open(bert_config_path, "r") as f:
@@ -74,42 +116,51 @@ class BertConfig:
             self.use_fp16 = use_fp16
             self.use_trt = use_trt
 
+
 def set_tensor_name(tensor, prefix, name):
     tensor.name = prefix + name
 
-def set_output_name(layer, prefix, name, out_idx = 0):
+
+def set_output_name(layer, prefix, name, out_idx=0):
     set_tensor_name(layer.get_output(out_idx), prefix, name)
 
-def set_output_range(layer, maxval, out_idx = 0):
+
+def set_output_range(layer, maxval, out_idx=0):
     layer.get_output(out_idx).set_dynamic_range(-maxval, maxval)
 
+
 def get_mha_dtype(config):
-    dtype = trt.float32
+    dtype = ixrt.float32
     if config.use_fp16:
-        dtype = trt.float16
+        dtype = ixrt.float16
     return int(dtype)
 
+
 def custom_fc(network, input_tensor, out_dims, W, B):
-    pf_out_dims = trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32)
-    pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32)
-    pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32)
+    pf_out_dims = ixrt.PluginField(
+        "out_dims", np.array(out_dims, dtype=np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_type = ixrt.PluginField(
+        "type_id", np.array(int(ixrt.float16), dtype=np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_W = ixrt.PluginField("W", W, ixrt.PluginFieldType.FLOAT32)
     fields = [pf_out_dims, pf_type, pf_W]
     if B is not None:
-        pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32)
+        pf_B = ixrt.PluginField("B", B, ixrt.PluginFieldType.FLOAT32)
         fields.append(pf_B)
 
-    pfc = trt.PluginFieldCollection(fields)
+    pfc = ixrt.PluginFieldCollection(fields)
     fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc)
     plug_inputs = [input_tensor]
     out_dense = network.add_plugin_v2(plug_inputs, fc_plugin)
     return out_dense
 
+
 def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
     """
     Add the attention layer
     """
-    assert(len(input_tensor.shape) == 5)
-    B, S, hidden_size, _, _ = input_tensor.shape
+    B, S, hidden_size = input_tensor.shape
     num_heads = config.num_attention_heads
     head_size = int(hidden_size / num_heads)
 
@@ -117,18 +168,27 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask)
     Ball = init_dict[prefix + BQKV]
 
     # FC_attention
-    if config.use_trt:
-        mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball)
-    else:
-        mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball)
+    mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball)
 
     has_mask = imask is not None
     # QKV2CTX
-    pf_type = trt.PluginField("type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32)
-    pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
-    pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32)
-    pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32)
-    pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask, pf_type])
+    pf_type = ixrt.PluginField(
+        "type_id",
+        np.array([get_mha_dtype(config)], np.int32),
+        ixrt.PluginFieldType.INT32,
+    )
+    pf_hidden_size = ixrt.PluginField(
+        "hidden_size", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_num_heads = ixrt.PluginField(
+        "num_heads", np.array([num_heads], np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_has_mask = ixrt.PluginField(
+        "has_mask", np.array([has_mask], np.int32), ixrt.PluginFieldType.INT32
+    )
+    pfc = ixrt.PluginFieldCollection(
+        [pf_hidden_size, pf_num_heads, pf_has_mask, pf_type]
+    )
     qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc)
 
     qkv_in = [mult_all.get_output(0)]
@@ -143,46 +203,56 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, bias=None):
     Add the skip layer
     """
     idims = input_tensor.shape
-    assert len(idims) == 5
     hidden_size = idims[2]
 
-    dtype = trt.float32
+    dtype = ixrt.float32
     if config.use_fp16:
-        dtype = trt.float16
+        dtype = ixrt.float16
 
-    pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32)
+    pf_ld = ixrt.PluginField(
+        "ld", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32
+    )
     wbeta = init_dict[prefix + "beta"]
-    pf_beta = trt.PluginField("beta", wbeta, trt.PluginFieldType.FLOAT32)
+    pf_beta = ixrt.PluginField("beta", wbeta, ixrt.PluginFieldType.FLOAT32)
     wgamma = init_dict[prefix + "gamma"]
-    pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32)
-    pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32)
+    pf_gamma = ixrt.PluginField("gamma", wgamma, ixrt.PluginFieldType.FLOAT32)
+    pf_type = ixrt.PluginField(
+        "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32
+    )
 
-    fields = [pf_ld, pf_beta, pf_gamma, pf_type ]
+    fields = [pf_ld, pf_beta, pf_gamma, pf_type]
 
     if bias is not None:
-        pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32)
+        pf_bias = ixrt.PluginField("bias", bias, ixrt.PluginFieldType.FLOAT32)
         fields.append(pf_bias)
 
-    pfc = trt.PluginFieldCollection(fields)
+    pfc = ixrt.PluginFieldCollection(fields)
     skipln_plug = skln_plg_creator.create_plugin("skipln", pfc)
 
     skipln_inputs = [input_tensor, skip]
     layer = network.add_plugin_v2(skipln_inputs, skipln_plug)
     return layer
 
+
 def ffn_trt(prefix, config, init_dict, network, input_tensor):
-     # FC1 + GELU
+    # FC1 + GELU
     B_mid = init_dict[prefix + B_MID]
     W_mid = init_dict[prefix + W_MID]
-    mid_dense = network.add_fully_connected(input_tensor, config.intermediate_size, W_mid, B_mid)
+    mid_dense = network.add_fully_connected(
+        input_tensor, config.intermediate_size, W_mid, B_mid
+    )
 
-    dtype = trt.float32
+    dtype = ixrt.float32
     if config.use_fp16:
-        dtype = trt.float16
-    pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32)
-    pf_ld = trt.PluginField("ld", np.array([config.hidden_size], np.int32), trt.PluginFieldType.INT32)
-
-    pfc = trt.PluginFieldCollection([pf_type, pf_ld])
+        dtype = ixrt.float16
+    pf_type = ixrt.PluginField(
+        "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_ld = ixrt.PluginField(
+        "ld", np.array([config.hidden_size], np.int32), ixrt.PluginFieldType.INT32
+    )
+
+    pfc = ixrt.PluginFieldCollection([pf_type, pf_ld])
     gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc)
 
     gelu_inputs = [mid_dense.get_output(0)]
@@ -194,54 +264,88 @@ def ffn_trt(prefix, config, init_dict, network, input_tensor):
     # Dense to hidden size
     B_lout = init_dict[prefix + B_LOUT]
     W_lout = init_dict[prefix + W_LOUT]
-    out_dense = network.add_fully_connected(intermediate_act, config.hidden_size, W_lout, B_lout)
+    out_dense = network.add_fully_connected(
+        intermediate_act, config.hidden_size, W_lout, B_lout
+    )
     B_lout = None
 
-    out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, B_lout)
+    out_layer = skipln(
+        prefix + "output_layernorm_",
+        config,
+        init_dict,
+        network,
+        out_dense.get_output(0),
+        input_tensor,
+        B_lout,
+    )
     return out_layer
 
+
 def ffn(prefix, config, init_dict, network, input_tensor):
     # FC1 + GELU
     B_mid = init_dict[prefix + B_MID]
     W_mid = init_dict[prefix + W_MID]
     B_lout = init_dict[prefix + B_LOUT]
     W_lout = init_dict[prefix + W_LOUT]
-    pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32)
-    pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32)
-    pf_W1 = trt.PluginField("W1", W_mid, trt.PluginFieldType.FLOAT32)
-    pf_W2 = trt.PluginField("W2", W_lout, trt.PluginFieldType.FLOAT32)
-    pf_B1 = trt.PluginField("B1", B_mid, trt.PluginFieldType.FLOAT32)
-    pf_act_type = trt.PluginField("act_type", np.array(int(3), np.int32), trt.PluginFieldType.INT32)
-    pfc = trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type])
+    pf_out_dim = ixrt.PluginField(
+        "out_dims", np.array(config.hidden_size, np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_type = ixrt.PluginField(
+        "type_id", np.array(int(ixrt.float16), np.int32), ixrt.PluginFieldType.INT32
+    )
+    pf_W1 = ixrt.PluginField("W1", W_mid, ixrt.PluginFieldType.FLOAT32)
+    pf_W2 = ixrt.PluginField("W2", W_lout, ixrt.PluginFieldType.FLOAT32)
+    pf_B1 = ixrt.PluginField("B1", B_mid, ixrt.PluginFieldType.FLOAT32)
+    pf_act_type = ixrt.PluginField(
+        "act_type", np.array(int(3), np.int32), ixrt.PluginFieldType.INT32
+    )
+    pfc = ixrt.PluginFieldCollection(
+        [pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]
+    )
     ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc)
 
     ffn_inputs = [input_tensor]
     ffn_layer = network.add_plugin_v2(ffn_inputs, ffn_plug)
 
-    out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, ffn_layer.get_output(0), input_tensor, B_lout)
+    out_layer = skipln(
+        prefix + "output_layernorm_",
+        config,
+        init_dict,
+        network,
+        ffn_layer.get_output(0),
+        input_tensor,
+        B_lout,
+    )
     return out_layer
 
+
 def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask):
     """
     Add the transformer layer
     """
     idims = input_tensor.shape
-    assert len(idims) == 5
     hidden_size = idims[2]
 
-    context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask)
+    context_transposed = attention_layer_opt(
+        prefix + "attention_", config, init_dict, network, input_tensor, imask
+    )
     attention_heads = context_transposed.get_output(0)
-    
+
     # FC0
     B_aout = init_dict[prefix + B_AOUT]
     W_aout = init_dict[prefix + W_AOUT]
-    if config.use_trt:
-        attention_out_fc = network.add_fully_connected(attention_heads, hidden_size, W_aout, B_aout)
-    else:
-        attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout)
-    B_aout = None     
-    
-    skiplayer = skipln(prefix + "attention_output_layernorm_",config, init_dict, network, attention_out_fc.get_output(0), input_tensor, B_aout)
+    attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout)
+    B_aout = None
+
+    skiplayer = skipln(
+        prefix + "attention_output_layernorm_",
+        config,
+        init_dict,
+        network,
+        attention_out_fc.get_output(0),
+        input_tensor,
+        B_aout,
+    )
     attention_ln = skiplayer.get_output(0)
 
     if config.use_trt:
@@ -250,121 +354,277 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas
         ffn_layer = ffn(prefix, config, init_dict, network, attention_ln)
     return ffn_layer
 
+
 def bert_model(config, init_dict, network, input_tensor, input_mask):
     """
     Create the bert model
     """
     prev_input = input_tensor
     for layer in range(0, config.num_hidden_layers):
-        ss = "l{}_".format(layer)   
-        out_layer = transformer_layer_opt(ss, config,  init_dict, network, prev_input, input_mask)
+        ss = "l{}_".format(layer)
+        out_layer = transformer_layer_opt(
+            ss, config, init_dict, network, prev_input, input_mask
+        )
         prev_input = out_layer.get_output(0)
     return prev_input
 
+
 def squad_output(prefix, config, init_dict, network, input_tensor):
     """
     Create the squad output
     """
 
     idims = input_tensor.shape
-    assert len(idims) == 5
-    B, S, hidden_size, _, _ = idims
+    B, S, hidden_size = idims
 
     W_out = init_dict[prefix + SQD_W]
     B_out = init_dict[prefix + SQD_B]
 
+    dense = custom_fc(network, input_tensor, 2, W_out, B_out)
+
     if config.use_trt:
-        dense = network.add_fully_connected(input_tensor, 2, W_out, B_out)
-    else:
-        dense = custom_fc(network, input_tensor, 2, W_out, B_out)
-        
+        OUT = network.add_shuffle(dense.get_output(0))
+        OUT.second_transpose = (1, 0, 2)
+        return OUT
     return dense
 
-def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes):
-    input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
-    segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
-    input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
+
+def emb_layernorm(
+    builder,
+    network,
+    config,
+    weights_dict,
+    builder_config,
+    sequence_lengths,
+    batch_sizes,
+):
+    input_ids = network.add_input(
+        name="input_ids",
+        dtype=ixrt.int32,
+        shape=(
+            -1 if len(batch_sizes) > 1 else batch_sizes[0],
+            -1 if len(sequence_lengths) > 1 else sequence_lengths[0],
+        ),
+    )
+    segment_ids = network.add_input(
+        name="segment_ids",
+        dtype=ixrt.int32,
+        shape=(
+            -1 if len(batch_sizes) > 1 else batch_sizes[0],
+            -1 if len(sequence_lengths) > 1 else sequence_lengths[0],
+        ),
+    )
+    input_mask = network.add_input(
+        name="input_mask",
+        dtype=ixrt.int32,
+        shape=(
+            -1 if len(batch_sizes) > 1 else batch_sizes[0],
+            -1 if len(sequence_lengths) > 1 else sequence_lengths[0],
+        ),
+    )
 
     if len(sequence_lengths) > 1:
         profile = builder.create_optimization_profile()
         min_shape = (batch_sizes[0], sequence_lengths[0])
         opt_shape = (batch_sizes[1], sequence_lengths[1])
         max_shape = (batch_sizes[2], sequence_lengths[2])
-        assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2])
-        
-        print('set dynamic shape -> ', min_shape, opt_shape, max_shape)
+        assert (
+            sequence_lengths[0] <= sequence_lengths[1]
+            and sequence_lengths[1] <= sequence_lengths[2]
+        )
+
+        print("set dynamic shape -> ", min_shape, opt_shape, max_shape)
         profile.set_shape("input_ids", min_shape, opt_shape, max_shape)
         profile.set_shape("segment_ids", min_shape, opt_shape, max_shape)
         profile.set_shape("input_mask", min_shape, opt_shape, max_shape)
         builder_config.add_optimization_profile(profile)
 
-    wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32)
-    wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32)
-    wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32)
-    wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32)
-    wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32)
-
-    output_fp16 = trt.PluginField("output_fp16", np.array([1 if config.use_fp16 else 0]).astype(np.int32), trt.PluginFieldType.INT32)
-    mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32)
-
-    pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type])
+    wbeta = ixrt.PluginField(
+        "bert_embeddings_layernorm_beta",
+        weights_dict["bert_embeddings_layernorm_beta"],
+        ixrt.PluginFieldType.FLOAT32,
+    )
+
+    wgamma = ixrt.PluginField(
+        "bert_embeddings_layernorm_gamma",
+        weights_dict["bert_embeddings_layernorm_gamma"],
+        ixrt.PluginFieldType.FLOAT32,
+    )
+    wwordemb = ixrt.PluginField(
+        "bert_embeddings_word_embeddings",
+        weights_dict["bert_embeddings_word_embeddings"],
+        ixrt.PluginFieldType.FLOAT32,
+    )
+    wtokemb = ixrt.PluginField(
+        "bert_embeddings_token_type_embeddings",
+        weights_dict["bert_embeddings_token_type_embeddings"],
+        ixrt.PluginFieldType.FLOAT32,
+    )
+    wposemb = ixrt.PluginField(
+        "bert_embeddings_position_embeddings",
+        weights_dict["bert_embeddings_position_embeddings"],
+        ixrt.PluginFieldType.FLOAT32,
+    )
+
+    output_fp16 = ixrt.PluginField(
+        "output_fp16",
+        np.array([1 if config.use_fp16 else 0]).astype(np.int32),
+        ixrt.PluginFieldType.INT32,
+    )
+    mha_type = ixrt.PluginField(
+        "mha_type_id",
+        np.array([get_mha_dtype(config)], np.int32),
+        ixrt.PluginFieldType.INT32,
+    )
+
+    pfc = ixrt.PluginFieldCollection(
+        [wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]
+    )
     fn = emln_plg_creator.create_plugin("embeddings", pfc)
 
-    inputs = [input_ids, segment_ids, input_mask]
+    if config.use_trt:
+        input_ids = network.add_shuffle(input_ids)
+        input_ids.second_transpose = (1, 0)
+        segment_ids = network.add_shuffle(segment_ids)
+        segment_ids.second_transpose = (1, 0)
+        input_mask = network.add_shuffle(input_mask)
+        input_mask.second_transpose = (1, 0)
+        inputs = [
+            input_ids.get_output(0),
+            segment_ids.get_output(0),
+            input_mask.get_output(0),
+        ]
+    else:
+        inputs = [input_ids, segment_ids, input_mask]
     emb_layer = network.add_plugin_v2(inputs, fn)
     return emb_layer
 
+
 def build_engine(batch_sizes, sequence_lengths, config, weights_dict):
-    explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+    explicit_batch_flag = 1 << int(ixrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
 
-    builder = trt.Builder(TRT_LOGGER)
-    with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config:
+    builder = ixrt.Builder(TRT_LOGGER)
+    with builder.create_network(
+        explicit_batch_flag
+    ) as network, builder.create_builder_config() as builder_config:
         if config.use_fp16:
-            builder_config.set_flag(trt.BuilderFlag.FP16)
+            builder_config.set_flag(ixrt.BuilderFlag.FP16)
 
         # Create the network
-        emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes)
+        emb_layer = emb_layernorm(
+            builder,
+            network,
+            config,
+            weights_dict,
+            builder_config,
+            sequence_lengths,
+            batch_sizes,
+        )
         embeddings = emb_layer.get_output(0)
         mask_idx = emb_layer.get_output(1)
-        
+
         bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx)
 
         squad_logits = squad_output("cls_", config, weights_dict, network, bert_out)
         squad_logits_out = squad_logits.get_output(0)
+        squad_logits.set_output_type(0, ixrt.float32)
 
         network.mark_output(squad_logits_out)
 
         build_start_time = time.time()
-        engine = builder.build_engine(network, builder_config)
-        build_time_elapsed = (time.time() - build_start_time)
-        TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed))
-        return engine
-    
+        serialized_engine = builder.build_serialized_network(network, builder_config)
+        build_time_elapsed = time.time() - build_start_time
+        TRT_LOGGER.log(
+            TRT_LOGGER.INFO, "build serialized_engine in {:.3f} Sec".format(build_time_elapsed)
+        )
+        return serialized_engine
+
+
 def str2bool(v):
-    return v.lower() in ('yes', 'true')    
+    return v.lower() in ("yes", "true")
+
 
 def main():
-    parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-z", "--use_trt", type=str2bool, default=False, help = "Whether to use tensorRT or IxRT")
-    parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.")
-    parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.")
-    parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine")
-    parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int)
-    parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int)
-    parser.add_argument("-c", "--config-dir", required=True,
-                        help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google")
-    parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False)
-    parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False)
-    parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False)
-    parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False)
+    parser = argparse.ArgumentParser(
+        description="IxRT BERT Sample",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-z",
+        "--use_trt",
+        type=str2bool,
+        default=False,
+        help="Whether to use ixrt or IxRT",
+    )
+    parser.add_argument(
+        "-x", "--onnx", required=False, help="The ONNX model file path."
+    )
+    parser.add_argument(
+        "-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path."
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        required=True,
+        default="bert_base_384.engine",
+        help="The bert engine file, ex bert.engine",
+    )
+    parser.add_argument(
+        "-b",
+        "--batch-size",
+        nargs="+",
+        help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.",
+        type=int,
+    )
+    parser.add_argument(
+        "-s",
+        "--sequence-length",
+        nargs="+",
+        help="Sequence length of the BERT model",
+        type=int,
+    )
+    parser.add_argument(
+        "-c",
+        "--config-dir",
+        required=True,
+        help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google",
+    )
+    parser.add_argument(
+        "-f",
+        "--fp16",
+        action="store_true",
+        help="Indicates that inference should be run in FP16 precision",
+        required=False,
+    )
+    parser.add_argument(
+        "-j",
+        "--squad-json",
+        default="squad/dev-v1.1.json",
+        help="squad json dataset used for int8 calibration",
+        required=False,
+    )
+    parser.add_argument(
+        "-v",
+        "--vocab-file",
+        default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt",
+        help="Path to file containing entire understandable vocab",
+        required=False,
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Turn on verbose logger and set profiling verbosity to DETAILED",
+        required=False,
+    )
 
     args, _ = parser.parse_known_args()
     args.batch_size = args.batch_size or [1]
     args.sequence_length = args.sequence_length or [128]
-    args.use_trt = is_nvidia_platform()
 
     if len(args.sequence_length) not in [1, 3]:
-        print("Error: You must provide <args.sequence_length> either one or three integers.")
+        print(
+            "Error: You must provide <args.sequence_length> either one or three integers."
+        )
         sys.exit(1)
 
     if len(args.batch_size) not in [1, 3]:
@@ -375,7 +635,9 @@ def main():
         TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE
 
     bert_config_path = args.config_dir
-    TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path))
+    TRT_LOGGER.log(
+        TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)
+    )
 
     config = BertConfig(bert_config_path, args.fp16, args.use_trt)
 
@@ -384,15 +646,18 @@ def main():
     elif args.pytorch != None:
         weights_dict = load_pytorch_weights_and_quant(args.pytorch, config)
     else:
-        raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.")
+        raise RuntimeError(
+            "You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model."
+        )
 
-    with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine:
-        TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...")
-        serialized_engine = engine.serialize()
+    with build_engine(
+        args.batch_size, args.sequence_length, config, weights_dict
+    ) as serialized_engine:
         TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output))
         with open(args.output, "wb") as fout:
             fout.write(serialized_engine)
         TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.")
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py
similarity index 89%
rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py
rename to models/nlp/plm/bert_large_squad/ixrt/builder_int8.py
index e51d7c40..5a16bc53 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py
@@ -1,35 +1,3 @@
-#!/usr/bin/env python3
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
 import os
 import argparse
 import json
@@ -43,6 +11,7 @@ from builder_utils_int8 import load_pytorch_weights_and_quant
 from builder_utils_int8 import WQKV, BQKV  # Attention Keys
 from builder_utils_int8 import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT  # Transformer Keys
 from builder_utils_int8 import SQD_W, SQD_B  # SQuAD Output Keys
+from builder import custom_fc as custom_fc_fp16
 
 trt_version = [int(n) for n in trt.__version__.split('.')]
 
@@ -114,8 +83,7 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask)
     """
     Add the attention layer
     """
-    assert(len(input_tensor.shape) == 5)
-    B, S, hidden_size, _, _ = input_tensor.shape
+    B, S, hidden_size = input_tensor.shape
     num_heads = config.num_attention_heads
     head_size = int(hidden_size / num_heads)
 
@@ -157,7 +125,6 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, residual, is_
     Add the skip layer
     """
     idims = input_tensor.shape
-    assert len(idims) == 5
     hidden_size = idims[2]
 
     dtype = trt.float32
@@ -236,7 +203,6 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas
     Add the transformer layer
     """
     idims = input_tensor.shape
-    assert len(idims) == 5
     hidden_size = idims[2]
 
     context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask)
@@ -281,13 +247,12 @@ def squad_output(prefix, config, init_dict, network, input_tensor):
     """
 
     idims = input_tensor.shape
-    assert len(idims) == 5
-    B, S, hidden_size, _, _ = idims
+    B, S, hidden_size = idims
 
     W_out = init_dict[prefix + SQD_W]
     B_out = init_dict[prefix + SQD_B]
 
-    dense = network.add_fully_connected(input_tensor, 2, W_out, B_out)
+    dense = custom_fc_fp16(network, input_tensor, 2, W_out, B_out)
     return dense
 
 def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes):
@@ -314,7 +279,7 @@ def emb_layernorm(builder, network, config, weights_dict, builder_config, sequen
     wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32)
     wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32)
 
-    output_fp16 = trt.PluginField("output_fp16", np.array([0]).astype(np.int32), trt.PluginFieldType.INT32)
+    output_fp16 = trt.PluginField("output_fp16", np.array([1]).astype(np.int32), trt.PluginFieldType.INT32)
     mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32)
 
     pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type])
@@ -354,10 +319,10 @@ def build_engine(batch_sizes, sequence_lengths, config, weights_dict):
         network.mark_output(squad_logits_out)
 
         build_start_time = time.time()
-        engine = builder.build_engine(network, builder_config)
+        plan = builder.build_serialized_network(network, builder_config)
         build_time_elapsed = (time.time() - build_start_time)
         TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed))
-        return engine
+        return plan
     
 def main():
     parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -389,7 +354,7 @@ def main():
     if args.verbose:
         TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE
 
-    bert_config_path = args.config_dir
+    bert_config_path = os.path.join(args.config_dir, "bert_config.json")
     TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path))
 
     config = BertConfig(bert_config_path, args.int8)
@@ -403,13 +368,11 @@ def main():
         raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.")
 
     # engine = build_engine(args.batch_size, args.workspace_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, None, args.calib_num, args.verbose)
-    with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine:
-        TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...")
-        serialized_engine = engine.serialize()
+    with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine:
         TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output))
         with open(args.output, "wb") as fout:
             fout.write(serialized_engine)
         TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.")
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py
similarity index 74%
rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py
rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils.py
index 76737977..51c294e1 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py
@@ -1,34 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
 import onnx
 import numpy as np
 import tensorrt as trt
@@ -93,6 +62,10 @@ def get_onnx_weight_dict(tensor_dict, config):
             Bqkv[1,:] = tensor_dict[prefix + BK]
             Bqkv[2,:] = tensor_dict[prefix + BV]
 
+            if config.use_trt:
+                Wqkv = np.ascontiguousarray(Wqkv.reshape((3, N, H, N, H)).transpose((1,0,2,3,4)))
+                Bqkv = np.ascontiguousarray(Bqkv.reshape((3, N, H)).transpose((1,0,2)))
+
             weights_dict[prefix + WQKV] = Wqkv.flatten()
             weights_dict[prefix + BQKV] = Bqkv.flatten()
             weights_dict[prefix + WQKV + "_notrans"] = np.ascontiguousarray(Wqkv.T).flatten()
@@ -103,6 +76,10 @@ def get_onnx_weight_dict(tensor_dict, config):
             flat_tensor = np.ascontiguousarray(tensor).flatten()
             weights_dict[outname] = flat_tensor
 
+            if outname.find("kernel") != -1 and config.use_trt:
+                tensor = np.transpose(tensor)
+                weights_dict[outname + "_notrans"] = np.ascontiguousarray(tensor).flatten()
+
     return weights_dict
 
 def onnx_to_trt_name(onnx_name):
@@ -162,24 +139,67 @@ def onnx_to_trt_name(onnx_name):
     parsed = '_'.join(toks)
     return parsed
 
+def pt_to_trt_name(pt_name):
+    """
+    Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder
+    """
+    qkv_strings = {'key', 'value', 'query', 'query_key_value'}
+    pt_name = pt_name.lower()
+    toks = [t.strip('_') for t in pt_name.split('.')]
+    if toks[0] == 'bert': #embeddings or encoder
+        if toks[1] == 'encoder': #transformer
+            if toks[-2] == 'layernorm': #bias->beta, weight->gamma
+                toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma'
+            elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight':
+                toks[-1] = 'kernel'
+
+            if 'final_input_quantizer' not in toks[2]:
+                ind = toks.index('layers')+1 if 'layers' in toks else 3
+                toks = toks[ind:]
+                toks[0] = 'l{}'.format(int(toks[0]))
+
+        else:
+            if toks[-2] == 'layernorm': #bias->beta, weight->gamma
+                toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma'
+            else: #embeddings: drop "_weight" suffix
+                toks = toks[:-1]
+
+    elif 'qa_outputs' in pt_name: ##
+        name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights'
+        return name
+    else:
+        print("Encountered unknown case:", pt_name)
+        assert(False)
+    parsed = '_'.join(toks)
+    return parsed
+
 def load_onnx_weights_and_quant(path, config):
     """
     Load the weights from the onnx checkpoint
     """
     model = onnx.load(path)
     weights = model.graph.initializer
+    # for w in weights:
+    #     print(w.name, w.dims,flush=True)
     tensor_dict = dict((onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.int8).reshape(w.dims))
                        if w.name.split('_')[-1] == 'mask' else
                        (onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.float32).reshape(w.dims))
                        for w in weights)
+    # for key in tensor_dict:
+    #     print(key, tensor_dict[key].shape,flush=True)
+
     return get_onnx_weight_dict(tensor_dict, config)
 
 def load_pytorch_weights_and_quant(path, config):
     """
     Load the weights from the pytorch checkpoint
     """
-    state_dict = torch.load(path, map_location='cpu')["model"]
-    tensor_dict = {onnx_to_trt_name(name):val.numpy() for name, val in state_dict.items()}
+    state_dict = torch.load(path, map_location='cpu')
+    # for name in state_dict:
+    #     print(name, state_dict[name].size(),flush=True)
+    tensor_dict = {pt_to_trt_name(name):val.numpy()  for name, val in state_dict.items()}
+    # for key in tensor_dict:
+    #     print(key, tensor_dict[key].shape,flush=True)
     return get_onnx_weight_dict(tensor_dict, config)
 
 class BertConfig:
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py
similarity index 85%
rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py
rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py
index 56ac8d18..77997b1b 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py
@@ -1,34 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
 import numpy as np
 import tensorrt as trt
 import json
diff --git a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh
index ebc8effc..bc47b149 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh
+++ b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh
@@ -28,14 +28,6 @@ fi
 # install ixrt run
 bash /root/data/install/ixrt-1.0.0.alpha+corex.4.3.0-linux_x86_64.run
 
-if [ "$1" = "nvidia" ]; then
-    cmake -S . -B build -DUSE_TENSORRT=true
-    cmake --build build -j16
-else
-    cmake -S . -B build
-    cmake --build build -j16
-fi
-
 pip install -r requirements.txt
-mkdir -p ./python/data
-ln -s /root/data/checkpoints/bert-large-uncased/ ./python/data && ln -s /root/data/datasets/squad/ ./python/data
\ No newline at end of file
+mkdir -p ./data
+ln -s /root/data/checkpoints/bert-large-uncased/ ./data && ln -s /root/data/datasets/squad/ ./data
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake
deleted file mode 100644
index 07c436f5..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-if(NOT COMPILER_PATH)
-  if (EXISTS /opt/sw_home/local/bin/clang++)
-    set(COMPILER_PATH /opt/sw_home/local/bin)
-  elseif (EXISTS /usr/local/corex/bin/clang++)
-    set(COMPILER_PATH /usr/local/corex/bin)
-  else()
-    message(STATUS "COMPILER_PATH is not set and we couldn't find clang compiler neither, will use system C/C++ compiler")
-  endif()
-endif()
-if (COMPILER_PATH)
-  set(CMAKE_CXX_COMPILER ${COMPILER_PATH}/clang++)
-  set(CMAKE_C_COMPILER ${COMPILER_PATH}/clang)
-endif()
-
-message(STATUS "Use ${CMAKE_CXX_COMPILER} and ${CMAKE_C_COMPILER} as C++ and C compiler")
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake
deleted file mode 100644
index 58e39e60..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake
+++ /dev/null
@@ -1,57 +0,0 @@
-# This cmake does:
-# - Set CUDA_PATH
-# - Find libcudart
-# - Util functions like cuda_add_library, cuda_add_executable
-
-
-# CUDA_PATH can be specified through below means shown in priority order 1.
-# cmake command line argument, -DCUDA_PATH=/path/to/cuda 2. bash environment
-# variable, export CUDA_PATH=/path/to/cuda
-if(DEFINED ENV{CUDA_PATH})
-  set(CUDA_PATH "$ENV{CUDA_PATH}")
-else()
-  set(CUDA_PATH
-      "/opt/sw_home/local/cuda"
-      CACHE PATH "cuda installation root path")
-endif()
-message(STATUS "Use CUDA_PATH=${CUDA_PATH} ")
-
-# GPU arch
-if(NOT "${CUDA_ARCH}" STREQUAL "")
-  set(CUDA_ARCH
-      ${CUDA_ARCH}
-      CACHE STRING "GPU architecture tag, ivcore11")
-else("${CUDA_ARCH}" STREQUAL "")
-  set(CUDA_ARCH
-      "ivcore11"
-      CACHE STRING "GPU architecture tag, ivcore11")
-endif()
-message(STATUS "Use CUDA_ARCH=${CUDA_ARCH}")
-
-macro(cuda_add_executable)
-  foreach(File ${ARGN})
-    if(${File} MATCHES ".*\.cu$")
-      set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
-    endif()
-  endforeach()
-  add_executable(${ARGV})
-endmacro()
-
-macro(cuda_add_library)
-  foreach(File ${ARGN})
-    if(${File} MATCHES ".*\.cu$")
-      set_source_files_properties(${File} PROPERTIES LANGUAGE CXX)
-    endif()
-  endforeach()
-  add_library(${ARGV})
-endmacro()
-
-find_library(
-  CUDART_LIBRARY cudart
-  PATHS ${CUDA_PATH}
-  PATH_SUFFIXES lib/x64 lib64 lib
-  NO_DEFAULT_PATH)
-
-if (NOT USE_TRT)
-  set(CUDA_LIBRARIES cudart)
-endif()
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake
deleted file mode 100644
index 5b0f2729..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-# This cmake file decides how to build with IxRT
-# Custom IxRT Path
-if(NOT "${IXRT_HOME}" STREQUAL "")
-    set(IXRT_INCLUDE_DIR ${IXRT_HOME}/include)
-    set(IXRT_LIB_DIR ${IXRT_HOME}/lib)
-# From default paths
-else()
-  set(IXRT_INCLUDE_DIR /usr/local/corex/include)
-  set(IXRT_LIB_DIR /usr/local/corex/lib)
-endif()
-
-message(STATUS "IXRT_INCLUDE_DIR:   ${IXRT_INCLUDE_DIR}")
-message(STATUS "IXRT_LIB_DIR:   ${IXRT_LIB_DIR}")
-
-if(EXISTS ${IXRT_INCLUDE_DIR} AND EXISTS ${IXRT_LIB_DIR})
-    include_directories(${IXRT_INCLUDE_DIR})
-else()
-    message( FATAL_ERROR "IxRT library doesn't exist!")
-endif()
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake
deleted file mode 100644
index 60360699..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB_RECURSE PLUGIN_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp
-        ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc
-        ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu)
-
-if(DEFINED USE_TENSORRT)
-   list(FILTER PLUGIN_FILES EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/src/backend/ixinfer")
-endif()
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py
similarity index 83%
rename from models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py
rename to models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py
index ce5bb98d..c73db423 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py
@@ -1,18 +1,4 @@
 #!/usr/bin/env python3
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
 #
 # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
@@ -107,10 +93,6 @@ def evaluate(dataset, predictions, f1_acc):
         print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.")
     else:
         print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.")
-    metricResult = {"metricResult": {}}
-    metricResult["metricResult"]["exact_match"] = round(exact_match, 3)
-    metricResult["metricResult"]["f1"] = round(f1, 3)
-    print(metricResult)
     return {'exact_match': exact_match, 'f1': f1}
 
 if __name__ == '__main__':
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py
similarity index 100%
rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py
rename to models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py
similarity index 89%
rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py
rename to models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py
index beacc625..73084f39 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py
@@ -19,8 +19,8 @@
 import tensorrt as trt
 import os
 
-import pycuda.driver as cuda
-import pycuda.autoinit
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 import numpy as np
 import helpers.tokenization as tokenization
 import helpers.data_processing as dp
@@ -80,9 +80,12 @@ class BertCalibrator(trt.IInt8LegacyCalibrator):
                 segment_ids = features[0].segment_ids
                 input_mask = features[0].input_mask
 
-        cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel())
-        cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel())
-        cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel())
+        err, = cuda.cuMemcpyHtoD(self.device_inputs[0], input_ids.ravel(), input_ids.ravel().nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        err, = cuda.cuMemcpyHtoD(self.device_inputs[1], segment_ids.ravel(), segment_ids.ravel().nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        err, = cuda.cuMemcpyHtoD(self.device_inputs[2], input_mask.ravel(), input_mask.ravel().nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
         self.current_index += self.batch_size
         return self.device_inputs
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py
similarity index 98%
rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py
rename to models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py
index 712e1a61..88459ebf 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py
@@ -159,14 +159,14 @@ def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_le
         input_mask = [1] * len(input_ids)
 
         # Zero-pad up to the sequence length.
-        # while len(input_ids) < max_seq_length:
-        #     input_ids.append(0)
-        #     input_mask.append(0)
-        #     segment_ids.append(0)
-
-        # assert len(input_ids) == max_seq_length
-        # assert len(input_mask) == max_seq_length
-        # assert len(segment_ids) == max_seq_length
+        while len(input_ids) < max_seq_length:
+            input_ids.append(0)
+            input_mask.append(0)
+            segment_ids.append(0)
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
 
         def create_int_feature(values):
             feature = np.asarray(values, dtype=np.int32, order=None)
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py
similarity index 100%
rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py
rename to models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py b/models/nlp/plm/bert_large_squad/ixrt/inference.py
similarity index 79%
rename from models/nlp/plm/bert_large_squad/ixrt/python/inference.py
rename to models/nlp/plm/bert_large_squad/ixrt/inference.py
index ec93972d..95b88dc5 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/inference.py
@@ -38,8 +38,8 @@ import argparse
 import collections
 import numpy as np
 import tensorrt as trt
-import pycuda.driver as cuda
-import pycuda.autoinit
+import cuda.cuda as cuda
+import cuda.cudart as cudart
 
 import helpers.tokenization as tokenization
 import helpers.data_processing as dp
@@ -153,14 +153,15 @@ if __name__ == '__main__':
                 break
         if selected_profile == -1:
             raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size))
-        
+
         # Create a stream in which to copy inputs/outputs and run inference.
-        stream = cuda.Stream()
+        err_dr, stream = cuda.cuStreamCreate(0)
+        assert(err_dr == cuda.CUresult.CUDA_SUCCESS)
         
         # if args.use_trt:
         #     context.active_optimization_profile = selected_profile
         # else:
-        context.set_optimization_profile_async(selected_profile, stream.handle)
+        context.set_optimization_profile_async(selected_profile, stream)
         binding_idx_offset = selected_profile * num_binding_per_profile
 
         input_shape = (args.batch_size, max_seq_length)
@@ -170,11 +171,17 @@ if __name__ == '__main__':
         assert context.all_binding_shapes_specified
 
         # Allocate device memory for inputs.
-        d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]
+        d_inputs = []
+        for binding in range(3):
+            err, ptr = cuda.cuMemAlloc(input_nbytes)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            d_inputs.append(ptr)
 
         # Allocate output buffer by querying the size from the context. This may be different for different input shapes.
-        h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32)
-        d_output = cuda.mem_alloc(h_output.nbytes)
+        h_output = np.empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32) 
+ 
+        err, d_output = cuda.cuMemAlloc(h_output.nbytes)
+        assert(err == cuda.CUresult.CUDA_SUCCESS)
 
         def inference(features, tokens):
             global h_output
@@ -191,25 +198,32 @@ if __name__ == '__main__':
                 segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0)
                 input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0)
 
-                input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
-                segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
-                input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel()))
+                input_ids = cuda.cuMemHostRegister(np.ascontiguousarray(input_ids_batch.ravel()), input_ids_batch.nbytes)
+                segment_ids = cuda.cuMemHostRegister(np.ascontiguousarray(segment_ids_batch.ravel()), segment_ids_batch.nbytes)
+                input_mask = cuda.cuMemHostRegister(np.ascontiguousarray(input_mask_batch.ravel()), input_mask.nbytes)
 
                 eval_start_time = time.time()
-                cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)
-                cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)
-                cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids, input_ids.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids, segment_ids.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[2], input_mask, input_mask.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
                 # Run inference
-                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
+                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
                 # Synchronize the stream
-                stream.synchronize()
+                err, = cuda.cuStreamSynchronize(stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
                 eval_time_elapsed += (time.time() - eval_start_time)
 
                 # Transfer predictions back from GPU
-                cuda.memcpy_dtoh_async(h_output, d_output, stream)
-                stream.synchronize()
-
+                err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuStreamSynchronize(stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                # for x in h_output[0].reshape(-1,2):
+                #     print(x)
                 # Only retrieve and post-process the first batch
                 batch = h_output[0]
                 
@@ -218,7 +232,7 @@ if __name__ == '__main__':
                     end_logits = np.array(batch.squeeze()[:, 1]),
                     feature_index = feature_index
                     ))
-
+            
             eval_time_elapsed /= len(features)
 
             # Total number of n-best predictions to generate in the nbest_predictions.json output file
@@ -258,14 +272,16 @@ if __name__ == '__main__':
             batch_example = []
             max_batch_length = 0
             seq_length_list = []
-            for index in tqdm(sort_index):
+            for index in sort_index:
                 batch_feature.append(features_list[index])
                 batch_example.append(squad_examples[index])
                 max_batch_length = max(max_batch_length, len(features_list[index].input_ids))
                 if args.int8:
-                    max_batch_length = max_seq_length
-                else:
                     max_batch_length = math.ceil(max_batch_length / 2) * 2
+                else:
+                    # workround to solve bs=1 10% slow
+                    if args.batch_size == 1:
+                        max_batch_length = math.ceil(max_batch_length / 64) * 64
                 seq_length_list.append(len(features_list[index].input_ids))
                 if len(batch_feature) == args.batch_size:
                     batch_input_ids = [
@@ -319,28 +335,39 @@ if __name__ == '__main__':
                 for binding in range(3):
                     context.set_binding_shape(binding, (args.batch_size, max_seq_length))
                 assert context.all_binding_shapes_specified
-                cuda.memcpy_htod_async(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-                cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
-                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
-            stream.synchronize()
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
+            err, = cuda.cuStreamSynchronize(stream)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
             
-            start_time = time.time()
+            infer_toal_time = 0
             output_index = 0
             for input_ids, segment_ids in tqdm(all_token_ids):
                 for binding in range(3):
                     context.set_binding_shape(binding, input_ids.shape)
                 assert context.all_binding_shapes_specified
 
-                cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
-                cuda.memcpy_htod_async(d_inputs[1], segment_ids.ravel(), stream)
-                stream.synchronize()
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids.ravel(), input_ids.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids.ravel(), segment_ids.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuStreamSynchronize(stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                infer_start_time = time.time()
+                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream)
+                err, = cuda.cuStreamSynchronize(stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                infer_end_time = time.time()
+                infer_time = infer_end_time - infer_start_time
+                infer_toal_time += infer_time
+                err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+                err, = cuda.cuStreamSynchronize(stream)
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-                context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
-                stream.synchronize()
-                
-                cuda.memcpy_dtoh_async(h_output, d_output, stream)
-                stream.synchronize()
-    
                 new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2)
                 for index in range(input_ids.shape[0]):
                     networkOutputs.append(_NetworkOutput(
@@ -349,7 +376,12 @@ if __name__ == '__main__':
                         feature_index = index
                     ))
                     output_index += 1
-            infer_time = time.time() - start_time
+            for i in range(3):
+                err, = cuda.cuMemFree(d_inputs[i])
+                assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemFree(d_output)
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            
             output_index = 0
             for (be, bf) in zip(batch_example_list, batch_feature_list):
                 for index in range(len(bf)):
@@ -357,7 +389,7 @@ if __name__ == '__main__':
                         [networkOutputs[output_index]], args.n_best_size, args.max_answer_length)
                     output_index += 1
                     all_precision[be[index].id] = prediction
-            return infer_time, all_precision
+            return infer_toal_time, all_precision
 
         status = 0
         if squad_examples:
@@ -366,21 +398,16 @@ if __name__ == '__main__':
             features_list = []
             lengths = []
 
-            for example_index, example in tqdm(enumerate(squad_examples)):
+            for example_index, example in enumerate(squad_examples):
                 features = question_features(example.doc_tokens, example.question_text)
                 features_list.append(features[0])
                 lengths.append(len(features[0].input_ids))
 
             sort_index = np.argsort(lengths)
-            infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions)
-            print(F"E2E time : {infer_time:.3f} seconds")
+            infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions)          
             
-            qps = len(squad_examples)/infer_time
+            qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time
             print(f"Latency QPS: {qps} sentences/s")
-            metricResult = {"metricResult": {}}
-            metricResult["metricResult"]["E2E time"] = round(infer_time, 3)
-            metricResult["metricResult"]["Latency QPS"] = round(qps, 3)
-            print(metricResult)
 
             with open(output_prediction_file, "w") as f:
                 f.write(json.dumps(all_predictions, indent=4))
@@ -415,4 +442,4 @@ if __name__ == '__main__':
                     # question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))
         del context
         del engine
-        sys.exit(status)
\ No newline at end of file
+        sys.exit(status)
diff --git a/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py
new file mode 100644
index 00000000..b3701308
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,13 @@
+from os.path import join, dirname, exists
+import tensorrt as trt
+import ctypes
+
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    trt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py b/models/nlp/plm/bert_large_squad/ixrt/perf.py
similarity index 81%
rename from models/nlp/plm/bert_large_squad/ixrt/python/perf.py
rename to models/nlp/plm/bert_large_squad/ixrt/perf.py
index 968a3943..9f98fe39 100644
--- a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py
+++ b/models/nlp/plm/bert_large_squad/ixrt/perf.py
@@ -1,34 +1,3 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-#
-# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
 import argparse
 import ctypes
 import time
@@ -55,6 +24,7 @@ class DeviceBuffer(object):
 
 def main():
     parser = argparse.ArgumentParser(description='BERT Inference Benchmark')
+    parser.add_argument("-z", "--use_trt", action="store_false", help="Whether to use tensorRT or IxRT")
     parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine')
     parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int)
     parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int)
@@ -66,7 +36,7 @@ def main():
     args.batch_size = args.batch_size or [1]
 
     # Import necessary plugins for BERT TensorRT
-    load_ixrt_plugin(TRT_LOGGER, dynamic_path="../build/libixrt_plugin.so")
+    load_ixrt_plugin(TRT_LOGGER)
 
     with open(args.engine, 'rb') as f:
         runtime = trt.Runtime(TRT_LOGGER)
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py
deleted file mode 100644
index 93301c30..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-from os.path import join, dirname, exists, abspath
-import tensorrt as trt
-import ctypes
-import os
-import subprocess
-
-def is_nvidia_platform():
-    try:
-        # 尝试运行 nvidia-smi
-        subprocess.check_output(['nvidia-smi'])
-        return True
-    except (subprocess.CalledProcessError, FileNotFoundError):
-        return False
-
-def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(abspath(__file__)), "..", "build", "libixrt_plugin.so") 
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    handle = ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
-    handle.initLibNvInferPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
-    handle.initLibNvInferPlugins.restype = ctypes.c_bool
-    handle.initLibNvInferPlugins(None, namespace.encode('utf-8'))
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh
deleted file mode 100644
index 7a7a05c5..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    python3 builder.py -x ./data/bert-large-uncased/bert_large_v1_1_fake_quant.onnx \
-                       -w 4096 \
-                       -o ./data/bert_large_384.engine \
-                       -s 1 384 384 \
-                       -b 1 ${BSZ} ${BSZ} \
-                       --fp16 \
-                       -c ./data/bert-large-uncased/bert_config.json
-else
-    echo 'USE_INT8=True'
-    python3 builder_int8.py -pt ./data/bert-large-uncased/bert_large_int8_qat.bin \
-                -o ./data/bert_large_384_int8.engine \
-                -s 1 384 384 \
-                -b 1 ${BSZ} ${BSZ} \
-                -i \
-                -c ./data/bert-large-uncased/bert_config.json 
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh
deleted file mode 100644
index 550c735e..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-PASSAGE='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, 
-speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations 
-for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components 
-to take advantage of powerful TensorRT optimizations for your apps.'
-QUESTION="What is TensorRT?"
-
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    python3 inference.py -e ./data/bert_large_384.engine \
-                        -s 384 \
-                        -p $PASSAGE \
-                        -q $QUESTION \
-                        -v ./data/bert-large-uncased/vocab.txt 
-else
-    echo 'USE_INT8=True'
-    python3 inference.py -e ./data/bert_large_384_int8.engine \
-                        -s 384 \
-                        -p $PASSAGE \
-                        -q $QUESTION \
-                        -v ./data/bert-large-uncased/vocab.txt 
-fi
-
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh
deleted file mode 100644
index 088b1d39..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384.engine \
-                            -b ${BSZ} \
-                            -s 384 \
-                            -sq ./data/squad/dev-v1.1.json \
-                            -v ./data/bert-large-uncased/vocab.txt \
-                            -o ./data/predictions-bert_large_384.json 
-    python3 evaluate-v1.1.py  ./data/squad/dev-v1.1.json  ./data/predictions-bert_large_384.json 90
-else
-    echo 'USE_INT8=True'
-    UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384_int8.engine \
-                            -b ${BSZ} \
-                            -s 384 \
-                            -sq ./data/squad/dev-v1.1.json \
-                            -v ./data/bert-large-uncased/vocab.txt \
-                            -o ./data/predictions-bert_large_384_int8.json \
-                            -i
-    python3 evaluate-v1.1.py  ./data/squad/dev-v1.1.json  ./data/predictions-bert_large_384_int8.json 88
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh
deleted file mode 100644
index f19c1def..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-#    Licensed under the Apache License, Version 2.0 (the "License"); you may
-#    not use this file except in compliance with the License. You may obtain
-#    a copy of the License at
-#
-#         http://www.apache.org/licenses/LICENSE-2.0
-#
-#    Unless required by applicable law or agreed to in writing, software
-#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-#    License for the specific language governing permissions and limitations
-#    under the License.
-
-index=0
-options=("$@") # 将所有参数存储到数组中
-PRECISION=fp16
-BSZ=32
-
-# 循环遍历所有参数
-while [[ $index -lt ${#options[@]} ]]; do
-    argument=${options[$index]}
-    case $argument in
-    --bs)
-        ((index++))
-        BSZ=${options[$index]}
-        ;;
-    --prec)
-        ((index++))
-        PRECISION=${options[$index]}
-        ;;
-    esac
-    ((index++))
-done
-
-# 设置INT8_FLAG
-INT8_FLAG=""
-if [[ "$PRECISION" == "int8" ]]; then
-    INT8_FLAG="--int8"
-fi
-
-echo "PREC_FLAG=$INT8_FLAG"
-echo "PRECISION=$PRECISION"
-echo "BSZ=$BSZ"
-
-# 检查环境并执行相应的脚本
-if command -v ixsmi &>/dev/null; then
-    echo "MR env"
-    cmake -S . -B build
-    cmake --build build -j16
-elif command -v nvidia-smi &>/dev/null; then
-    echo "NV env"
-    cmake -S . -B build -DUSE_TENSORRT=true
-    cmake --build build -j16
-else
-    echo "No driver detected"
-    exit 1
-fi
-cd ./python/
-bash script/build_engine.sh --bs $BSZ $INT8_FLAG
-bash script/inference_squad.sh --bs $BSZ $INT8_FLAG
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh
deleted file mode 100644
index 1ad462a7..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-BSZ=1
-USE_FP16=True
-
-# Update arguments
-index=0
-options=$@
-arguments=($options)
-for argument in $options
-do
-    index=`expr $index + 1`
-    case $argument in
-      --bs) BSZ=${arguments[index]};;
-      --int8) USE_FP16=False;;
-    esac
-done
-
-if [ "$USE_FP16" = "True" ]; then
-    echo 'USE_FP16=True'
-    python3 perf.py -e ./data/bert_large_384.engine -b ${BSZ} -s 384
-else
-    echo 'USE_INT8=True'
-    python3 perf.py -e ./data/bert_large_384_int8.engine -b ${BSZ} -s 384
-fi
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh
new file mode 100644
index 00000000..02025e96
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_accuracy.sh
@@ -0,0 +1,50 @@
+set -eo pipefail
+
+BSZ=32
+TGT=90
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+      --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=./
+checkpoints_path=${project_path}/data/bert-large-uncased
+datasets_path=${project_path}/data
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine FP16(bert large squad)!"
+
+python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \
+                   -w 4096 \
+                   -o ${checkpoints_path}/bert_large_b${BSZ}.engine \
+                   -s 1 384 384\
+                   -b 1 ${BSZ} ${BSZ}\
+                   --fp16 \
+                   -c ${checkpoints_path}/bert_config.json \
+                   -z ${USE_TRT}
+
+echo "Step2 Run dev.json and generate json"
+python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \
+                        -s 384 \
+                        -b ${BSZ} \
+                        -sq ${datasets_path}/squad/dev-v1.1.json \
+                        -v ${checkpoints_path}/vocab.txt \
+                        -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \
+                        -z ${USE_TRT}
+
+echo "Step3 Inference(test F1-score)"
+python3 evaluate-v1.1.py  ${datasets_path}/squad/dev-v1.1.json  ${checkpoints_path}/predictions-bert_large_b${BSZ}.json ${TGT}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh
new file mode 100644
index 00000000..44285fa1
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_fp16_performance.sh
@@ -0,0 +1,48 @@
+set -eo pipefail
+
+BSZ=32
+TGT=150
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+      --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=./
+checkpoints_path=${project_path}/data/bert-large-uncased
+datasets_path=${project_path}/data
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine FP16(bert large squad)!"
+
+python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \
+                   -w 4096 \
+                   -o ${checkpoints_path}/bert_large_b${BSZ}.engine \
+                   -s 1 384 384\
+                   -b 1 ${BSZ} ${BSZ}\
+                   --fp16 \
+                   -c ${checkpoints_path}/bert_config.json \
+                   -z ${USE_TRT}
+
+echo "Step2 Inference(test QPS)"
+UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \
+                        -s 384 \
+                        -b ${BSZ} \
+                        -sq ${datasets_path}/squad/dev-v1.1.json \
+                        -v ${checkpoints_path}/vocab.txt \
+                        -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \
+                        -z ${USE_TRT} \
+                        --target_qps ${TGT}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh
new file mode 100644
index 00000000..ddbcf234
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_accuracy.sh
@@ -0,0 +1,49 @@
+set -eo pipefail
+
+BSZ=32
+TGT=88
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+      --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=./
+checkpoints_path=${project_path}/data/bert-large-uncased
+datasets_path=${project_path}/data
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine Int8(bert large squad)!"
+
+python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \
+                -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \
+                -b 1 ${BSZ} ${BSZ} \
+                -s 1 384 384 \
+                -i \
+                -c ${checkpoints_path}
+
+echo "Step2 Run dev.json and generate json"
+python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \
+                        -b ${BSZ} \
+                        -s 384 \
+                        -sq ${datasets_path}/squad/dev-v1.1.json \
+                        -v ${checkpoints_path}/vocab.txt \
+                        -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \
+                        -z ${USE_TRT} \
+                        -i
+
+echo "Step3 Inference(test F1-score)"
+python3 evaluate-v1.1.py  ${datasets_path}/squad/dev-v1.1.json  ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json ${TGT}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh
new file mode 100644
index 00000000..3ead05ef
--- /dev/null
+++ b/models/nlp/plm/bert_large_squad/ixrt/scripts/infer_bert_large_squad_int8_performance.sh
@@ -0,0 +1,47 @@
+set -eo pipefail
+
+BSZ=32
+TGT=200
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+      --bs) BSZ=${arguments[index]};;
+      --tgt) TGT=${arguments[index]};;
+      --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=./
+checkpoints_path=${project_path}/data/bert-large-uncased
+datasets_path=${project_path}/data
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine Int8(bert large squad)!"
+
+python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \
+                -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \
+                -b 1 ${BSZ} ${BSZ} \
+                -s 1 384 384 \
+                -i \
+                -c ${checkpoints_path}
+
+echo "Step2 Inference(test QPS)"
+UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \
+                        -b ${BSZ} \
+                        -s 384 \
+                        -sq ${datasets_path}/squad/dev-v1.1.json \
+                        -v ${checkpoints_path}/vocab.txt \
+                        -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \
+                        -z ${USE_TRT} \
+                        --target_qps ${TGT} \
+                        -i         
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/scripts/prepare.sh
similarity index 100%
rename from models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh
rename to models/nlp/plm/bert_large_squad/ixrt/scripts/prepare.sh
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc b/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc
deleted file mode 100644
index ceea8d8b..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include <memory>
-#include <mutex>
-#include <stack>
-#include <unordered_set>
-
-#include "NvInfer.h"
-#include "NvInferPlugin.h"
-#include "NvInferRuntimeCommon.h"
-#include "custom_fc/fcPlugin.h"
-#include "emb_layernorm/embLayerNormPlugin.h"
-#include "emb_layernorm/embLayerNormInt8Plugin.h"
-#include "gelu/geluPlugin.h"
-#include "qkv_to_context/qkvToContextInt8Plugin.h"
-#include "qkv_to_context/qkvToContextPlugin.h"
-#include "skip_layernorm/skipLayerNormInt8Plugin.h"
-#include "skip_layernorm/skipLayerNormPlugin.h"
-#include "ffn/ffnPlugin.h"
-
-using namespace nvinfer1;
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-extern ILogger* gLogger;
-
-}  // namespace plugin
-}  // namespace nvinfer1
-
-namespace {
-// This singleton ensures that each plugin is only registered once for a given
-// namespace and type, and attempts of duplicate registration are ignored.
-class PluginCreatorRegistry {
-   public:
-    static PluginCreatorRegistry& getInstance() {
-        static PluginCreatorRegistry instance;
-        return instance;
-    }
-
-    string GetPluginUniqKey(const AsciiChar* const plugin_namespace, const AsciiChar* const plugin_name,
-                            const AsciiChar* const plugin_version) {
-        stringstream os;
-        if (plugin_namespace[0] != '\0') {
-            os << plugin_namespace << "/";
-        }
-        os << plugin_name;
-        if (plugin_version[0] != '\0') {
-            os << "/" << plugin_version;
-        }
-        return os.str();
-    }
-
-    template <typename CreatorType>
-    void addPluginCreator(void* logger, char const* libNamespace) {
-        printf("start addPluginCreator %s\n", libNamespace);
-        // Make accesses to the plugin creator registry thread safe
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        std::string errorMsg;
-        std::string verboseMsg;
-
-        std::unique_ptr<CreatorType> pluginCreator{new CreatorType{}};
-        pluginCreator->setPluginNamespace(libNamespace);
-
-        nvinfer1::ixrt_plugin::gLogger = static_cast<nvinfer1::ILogger*>(logger);
-        std::string pluginType = GetPluginUniqKey(pluginCreator->getPluginNamespace(), pluginCreator->getPluginName(),
-                                                  pluginCreator->getPluginVersion());
-
-        if (mRegistryList.find(pluginType) == mRegistryList.end()) {
-            bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace);
-            if (status) {
-                mRegistry.push(std::move(pluginCreator));
-                mRegistryList.insert(pluginType);
-                printf("Registered plugin creator -  %s\n", pluginType.c_str());
-                verboseMsg = "Registered plugin creator - " + pluginType;
-            } else {
-                printf("Could not register plugin creator - %s\n", pluginType.c_str());
-                errorMsg = "Could not register plugin creator -  " + pluginType;
-            }
-        } else {
-            printf("Plugin creator already registered - %s\n", pluginType.c_str());
-            verboseMsg = "Plugin creator already registered - " + pluginType;
-        }
-
-        if (logger) {
-            if (!errorMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kERROR, errorMsg.c_str());
-            }
-            if (!verboseMsg.empty()) {
-                nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str());
-            }
-        }
-    }
-
-    ~PluginCreatorRegistry() {
-        std::lock_guard<std::mutex> lock(mRegistryLock);
-
-        // Release pluginCreators in LIFO order of registration.
-        while (!mRegistry.empty()) {
-            mRegistry.pop();
-        }
-        mRegistryList.clear();
-    }
-
-   private:
-    PluginCreatorRegistry() {}
-
-    std::mutex mRegistryLock;
-    std::stack<std::unique_ptr<IPluginCreator>> mRegistry;
-    std::unordered_set<std::string> mRegistryList;
-
-   public:
-    PluginCreatorRegistry(PluginCreatorRegistry const&) = delete;
-    void operator=(PluginCreatorRegistry const&) = delete;
-};
-
-template <typename CreatorType>
-void initializePlugin(void* logger, char const* libNamespace) {
-    PluginCreatorRegistry::getInstance().addPluginCreator<CreatorType>(logger, libNamespace);
-}
-
-}  // namespace
-
-extern "C" {
-bool initLibNvInferPlugins(void* logger, const char* libNamespace) {
-    initializePlugin<nvinfer1::ixrt_plugin::bert::FCPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::FCInt8PluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::FFNPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::EmbLayerNormPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::EmbLayerNormInt8PluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::GeluPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::QKVToContextPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::QKVToContextInt8PluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::SkipLayerNormPluginDynamicCreator>(logger, libNamespace);
-    initializePlugin<nvinfer1::ixrt_plugin::bert::SkipLayerNormInt8PluginHFaceCreator>(logger, libNamespace);
-    return true;
-}
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h
deleted file mode 100644
index bd094b40..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h
+++ /dev/null
@@ -1,299 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-#include <cuda.h>
-#include <cuda_fp16.h>
-
-#include <stdexcept>
-
-#ifndef C10_WARP_SIZE
-
-#ifdef __ILUVATAR__
-#define C10_WARP_SIZE 64
-#else
-#define C10_WARP_SIZE 32
-#endif
-
-#endif
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-const float epsilon = 0.000000000001;
-const unsigned int WARP_REDUCE_MASK = 0xffffffff;
-const float CUDA_FLOAT_INF_NEG = -100000000.f;  // FIXME later
-const float CUDA_FLOAT_INF_POS = 100000000.f;   // FIXME later
-const int CUDA_INT_INF = 2147483647;
-const int MAX_THREADS = 1024;
-
-__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) {
-    float i8_f = x * quant_scale;
-    int32_t i8 = floorf(i8_f + 0.5);
-    i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
-    return int8_t(i8);
-}
-
-inline __device__ void WelfordCombine(float val, float *mean, float *m2, float *count) {
-    // Use Welford Online algorithem to compute mean and variance
-    // For more details you can refer to:
-    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
-    *count += 1;
-    float delta1 = val - *mean;
-    *mean += delta1 / *count;
-    float delta2 = val - *mean;
-    *m2 += delta1 * delta2;
-}
-
-inline __device__ void WelfordCombine(float b_mean, float b_m2, float b_count, float *mean, float *m2, float *count) {
-    if (b_count == 0) {
-        return;
-    }
-    float new_count = *count + b_count;
-    float nb_over_n = b_count / new_count;
-    float delta = b_mean - *mean;
-    *mean += delta * nb_over_n;
-    *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
-    *count = new_count;
-}
-
-__inline__ __device__ void WelfordWarpReduce(float thread_mean, float thread_m2, float thread_count, float *mean,
-                                             float *m2, float *count) {
-    *mean = thread_mean;
-    *m2 = thread_m2;
-    *count = thread_count;
-    for (int mask = C10_WARP_SIZE / 2; mask > 0; mask /= 2) {
-        float b_mean = __shfl_down_sync(0xffffffff, *mean, mask);
-        float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask);
-        float b_count = __shfl_down_sync(0xffffffff, *count, mask);
-        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
-    }
-}
-// addd by pxl
-// block内所有数据完成reduce
-//  template <int >
-__inline__ __device__ void WelfordBlockAllReduce(float thread_mean, float thread_m2, float thread_count,
-                                                 float *result_mean, float *result_m2, float *result_count) {
-    __shared__ float mean_shared[C10_WARP_SIZE];
-    __shared__ float m2_shared[C10_WARP_SIZE];
-    __shared__ float count_shared[C10_WARP_SIZE];
-    __shared__ float mean_result_broadcast;
-    __shared__ float m2_result_broadcast;
-    __shared__ float count_result_broadcast;
-
-    const int lid = threadIdx.x % C10_WARP_SIZE;
-    const int wid = threadIdx.x / C10_WARP_SIZE;
-    float warp_mean = 0;
-    float warp_m2 = 0;
-    float warp_count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count);
-    __syncthreads();
-
-    if (lid == 0) {
-        mean_shared[wid] = warp_mean;
-        m2_shared[wid] = warp_m2;
-        count_shared[wid] = warp_count;
-    }
-    __syncthreads();
-
-    if (wid == 0) {
-        if (threadIdx.x < blockDim.x / C10_WARP_SIZE) {
-            warp_mean = mean_shared[lid];
-            warp_m2 = m2_shared[lid];
-            warp_count = count_shared[lid];
-
-        } else {
-            warp_mean = 0.f;
-            warp_m2 = 0.f;
-            warp_count = 0.f;
-        }
-        __syncwarp();
-
-        float block_mean = 0;
-        float block_m2 = 0;
-        float block_count = 0;
-
-        WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count);
-
-        if (lid == 0) {
-            mean_result_broadcast = block_mean;
-            m2_result_broadcast = block_m2;
-            count_result_broadcast = block_count;
-        }
-    }
-    __syncthreads();
-    *result_mean = mean_result_broadcast;
-    *result_m2 = m2_result_broadcast;
-    *result_count = count_result_broadcast;
-}
-__forceinline__ __device__ char4 float42char4(float4 vals, float quant_scale) {
-    char4 res;
-    res.x = float2int8(vals.x, quant_scale);
-    res.y = float2int8(vals.y, quant_scale);
-    res.z = float2int8(vals.z, quant_scale);
-    res.w = float2int8(vals.w, quant_scale);
-    return res;
-}
-
-// load 两个 half2, 保存到 float4
-__forceinline__ __device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) {
-    __half2 i1 = input[index * 2];
-    __half2 i2 = input[index * 2 + 1];
-
-    vals.x = __half2float(i1.x);
-    vals.y = __half2float(i1.y);
-    vals.z = __half2float(i2.x);
-    vals.w = __half2float(i2.y);
-}
-
-/* Convert vector index to 3-dim tensor index */
-__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1, int dim2, int *id0, int *id1, int *id2) {
-    *id2 = src % dim2;
-    src /= dim2;
-
-    *id1 = src % dim1;
-    *id0 = src / dim1;
-}
-
-__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size,
-                                                            float epsilon, float4 scale, float4 bias) {
-    float4 norm_value;
-    norm_value.x =
-        (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.x + bias.x;
-    norm_value.y =
-        (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.y + bias.y;
-    norm_value.z =
-        (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.z + bias.z;
-    norm_value.w =
-        (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.w + bias.w;
-    return norm_value;
-}
-
-// for layer norm
-__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size,
-                                                            float epsilon, half2 scale_1, half2 scale_2, half2 bias_1,
-                                                            half2 bias_2) {
-    float4 norm_value;
-    norm_value.x =
-        (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x);
-    norm_value.y =
-        (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y);
-    norm_value.z =
-        (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.x) + __half2float(bias_2.x);
-    norm_value.w =
-        (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.y) + __half2float(bias_2.y);
-    return norm_value;
-}
-/* Convert half2 into float2, mask inf and -inf */
-__forceinline__ __host__ __device__ float safe_half_to_float(half hval) {
-    return fmax(fmin(100000.f, __half2float(hval)), -100000.f);
-}
-__forceinline__ __device__ float4 char4addfloat4_dequant(char4 input_4, float4 residual,
-                                                        float dequant_scale) {
-    float4 res;
-    res.x = __int2float_rn(input_4.x) * dequant_scale + residual.x;
-    res.y = __int2float_rn(input_4.y) * dequant_scale + residual.y;
-    res.z = __int2float_rn(input_4.z) * dequant_scale + residual.z;
-    res.w = __int2float_rn(input_4.w) * dequant_scale + residual.w;
-    return res;
-}
-
-__forceinline__ __device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1, half2 residual_2,
-                                                        float dequant_scale) {
-    float4 res;
-    res.x = __int2float_rn(input_4.x) * dequant_scale + safe_half_to_float(residual_1.x);
-    res.y = __int2float_rn(input_4.y) * dequant_scale + safe_half_to_float(residual_1.y);
-    res.z = __int2float_rn(input_4.z) * dequant_scale + safe_half_to_float(residual_2.x);
-    res.w = __int2float_rn(input_4.w) * dequant_scale + safe_half_to_float(residual_2.y);
-    return res;
-}
-
-// gelu
-//  IxinferBiasGeluI8II8OKernel
-template <typename T>
-__forceinline__ __device__ T tanhf_exp(T x) {
-    // float e1 = __expf(x);
-    // float e2 = 1.0f / e1;
-    // return (e1 - e2) / (e1 + e2);
-
-    return (2.f / (1.f + __expf(-2.f * x)) - 1.f);
-}
-
-template <typename T>
-__forceinline__ __device__ T gelu(T x) {
-    float cdf = 0.5f * (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x))));
-    return x * cdf;
-}
-
-// softmax
-__forceinline__ __host__ __device__ int log2_ceil(int value) {
-    int log2_value = 0;
-    while ((1 << log2_value) < value) ++log2_value;
-    return log2_value;
-}
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width, unsigned int mask = 0xffffffff) {
-#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__))
-    return __shfl_xor_sync(mask, value, laneMask, width);
-#else
-    return __shfl_xor(value, laneMask, width);
-#endif
-}
-
-template <typename T>
-struct Add {
-    __device__ __forceinline__ T operator()(T a, T b) const { return a + b; }
-};
-
-template <typename T>
-struct Max {
-    __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; }
-};
-template <typename acc_t, int REDUCE_WARP_SIZE, template <typename> class ReduceOp>
-__device__ __forceinline__ void warp_reduce(acc_t *sum) {
-    ReduceOp<acc_t> r;
-#pragma unroll
-    for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) {
-        acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE);
-        *sum = r(*sum, b);
-    }
-}
-/* Convert 3-dim tensor index into vector index */
-__forceinline__ __host__ __device__ int targetid_3dim(int id1, int id2, int id3, int dim2, int dim3) {
-    return id1 * dim2 * dim3 + id2 * dim3 + id3;
-}
-
-/* Convert 4-dim tensor index into vector index */
-__forceinline__ __host__ __device__ int targetid_4dim(int id1, int id2, int id3, int id4, int dim2, int dim3,
-                                                      int dim4) {
-    // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4;
-    int res = id4;
-
-    int ld = dim4;
-    res += id3 * ld;
-
-    ld *= dim3;
-    res += id2 * ld;
-
-    ld *= dim2;
-    res += id1 * ld;
-
-    return res;
-}
-
-}  // namespace backend
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h
deleted file mode 100644
index c0f34842..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h
+++ /dev/null
@@ -1,312 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-#include <cublasLt.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <cuda_runtime.h>
-
-#include <stdexcept>
-
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-/* GPU function guard */
-
-/**
- * @brief cublasLt gemm without imma
- *
- * @tparam OutType output dtype
- * @tparam ScaleType scale dtype
- * @param input_a
- * @param input_b
- * @param output_c
- * @param batch_count
- * @param m
- * @param n
- * @param k
- * @param stridea
- * @param strideb
- * @param stridec
- * @param alpha
- * @param cublasLt_handle
- * @param stream
- */
-template <typename OutType, typename ScaleType>
-void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n,
-                   int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha,
-                   cublasLtHandle_t cublasLt_handle, cudaStream_t stream) {
-    cublasOperation_t transpose = CUBLAS_OP_T;
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I;
-#else
-    cudaDataType_t compute_type = CUDA_R_32I;
-#endif
-    cublasLtMatmulDesc_t matmul_desc;
-    cublasLtMatrixLayout_t desc_a = NULL;
-    cublasLtMatrixLayout_t desc_b = NULL;
-    cublasLtMatrixLayout_t desc_c = NULL;
-
-    cudaDataType_t out_dtype;
-    cudaDataType_t scale_dtype;
-    if (std::is_same<OutType, int32_t>::value) {
-        out_dtype = CUDA_R_32I;
-        scale_dtype = CUDA_R_32I;
-    } else if (std::is_same<OutType, int8_t>::value) {
-        out_dtype = CUDA_R_8I;
-        scale_dtype = CUDA_R_32F;
-    } else {
-        throw std::runtime_error("Unsupported output type");
-    }
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype));
-#else
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type));
-    CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype,
-                                                   sizeof(scale_dtype)));
-#endif
-    CHECK_GPU_ERROR(
-        cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose)));
-
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, k, m, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m));
-
-    if (batch_count > 1) {
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-                                                         sizeof(stridea)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-                                                         sizeof(strideb)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-                                                         sizeof(stridec)));
-    }
-
-    ScaleType beta = ScaleType(0);
-    CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta,
-                                   output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream));
-
-    CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c));
-}
-
-inline void cublaslt_gemm(const half* input_a, const half* input_b, half* output_c, int batch_count, int m, int n,
-                          int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                          cublasLtHandle_t cublasLt_handle, cudaStream_t stream) {
-    cublasOperation_t transpose = CUBLAS_OP_T;
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
-#else
-    cudaDataType_t compute_type = CUDA_R_32F;
-#endif
-    cublasLtMatmulDesc_t matmul_desc;
-    cublasLtMatrixLayout_t desc_a = NULL;
-    cublasLtMatrixLayout_t desc_b = NULL;
-    cublasLtMatrixLayout_t desc_c = NULL;
-
-    cudaDataType_t out_dtype = CUDA_R_16F;
-    cudaDataType_t scale_dtype = CUDA_R_32F;
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype));
-#else
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type));
-    CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype,
-                                                   sizeof(scale_dtype)));
-#endif
-    CHECK_GPU_ERROR(
-        cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose)));
-
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, k, m, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m));
-
-    if (batch_count > 1) {
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-                                                         sizeof(stridea)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-                                                         sizeof(strideb)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-                                                         sizeof(stridec)));
-    }
-
-    float beta = 0.0;
-    CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta,
-                                   output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream));
-
-    CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c));
-}
-
-template void cublaslt_gemm<int32_t, int32_t>(const int8_t* input_a, const int8_t* input_b, int32_t* output_c,
-                                              int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb,
-                                              int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle,
-                                              cudaStream_t stream);
-
-template void cublaslt_gemm<int8_t, float>(const int8_t* input_a, const int8_t* input_b, int8_t* output_c,
-                                           int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb,
-                                           int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle,
-                                           cudaStream_t stream);
-
-/************add by pxl *************/
-template <typename OutType, typename ScaleType>
-void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n,
-                      int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha,
-                      cublasLtHandle_t cublasLt_handle, cudaStream_t stream) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I;
-#else
-    cudaDataType_t compute_type = CUDA_R_32I;
-#endif
-    cublasLtMatmulDesc_t matmul_desc;
-    cublasLtMatrixLayout_t desc_a = NULL;
-    cublasLtMatrixLayout_t desc_b = NULL;
-    cublasLtMatrixLayout_t desc_c = NULL;
-
-    cudaDataType_t out_dtype;
-    cudaDataType_t scale_dtype;
-    if (std::is_same<OutType, int32_t>::value) {
-        out_dtype = CUDA_R_32I;
-        scale_dtype = CUDA_R_32I;
-    } else if (std::is_same<OutType, int8_t>::value) {
-        out_dtype = CUDA_R_8I;
-        scale_dtype = CUDA_R_32F;
-    } else {
-        throw std::runtime_error("Unsupported output type");
-    }
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype));
-#else
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type));
-    CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype,
-                                                   sizeof(scale_dtype)));
-#endif
-
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, m, k, m));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m));
-
-    if (batch_count > 1) {
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-                                                         sizeof(stridea)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-                                                         sizeof(strideb)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-                                                         sizeof(stridec)));
-    }
-
-    ScaleType beta = ScaleType(0);
-    CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta,
-                                   output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream));
-
-    CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c));
-}
-
-template void cublaslt_gemm_nn<int32_t, int32_t>(const int8_t* input_a, const int8_t* input_b, int32_t* output_c,
-                                                 int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb,
-                                                 int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle,
-                                                 cudaStream_t stream);
-
-template void cublaslt_gemm_nn<int8_t, float>(const int8_t* input_a, const int8_t* input_b, int8_t* output_c,
-                                              int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb,
-                                              int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle,
-                                              cudaStream_t stream);
-
-inline void cublaslt_gemm_nn(const half* input_a, const half* input_b, half* output_c, int batch_count, int m, int n,
-                          int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                          cublasLtHandle_t cublasLt_handle, cudaStream_t stream) {
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
-#else
-    cudaDataType_t compute_type = CUDA_R_32F;
-#endif
-    cublasLtMatmulDesc_t matmul_desc;
-    cublasLtMatrixLayout_t desc_a = NULL;
-    cublasLtMatrixLayout_t desc_b = NULL;
-    cublasLtMatrixLayout_t desc_c = NULL;
-
-    cudaDataType_t out_dtype = CUDA_R_16F;
-    cudaDataType_t scale_dtype = CUDA_R_32F;
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype));
-#else
-    CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type));
-    CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype,
-                                                   sizeof(scale_dtype)));
-#endif
-
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, m, k, m));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m));
-
-    if (batch_count > 1) {
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea,
-                                                         sizeof(stridea)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb,
-                                                         sizeof(strideb)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count,
-                                                         sizeof(batch_count)));
-        CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec,
-                                                         sizeof(stridec)));
-    }
-
-    float beta = 0.0;
-    CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta,
-                                   output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream));
-
-    CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b));
-    CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c));
-}
-
-}  // namespace backend
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
deleted file mode 100644
index b3f0bbcb..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "ixinfer_gemm_helper.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: m,k input_b: n,k  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: n,k  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: k,n  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream) {
-    /* TN: input_a: k,m input_b: k,n  output_c: n,m */
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-
-    cudaDataType_t Atype = CUDA_R_8I;
-    cudaDataType_t Btype = CUDA_R_8I;
-    cudaDataType_t Ctype = CUDA_R_8I;
-    cudaDataType_t computeType = CUDA_R_32I;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, nullptr, customOption);
-
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error!");
-    }
-}
-
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                  cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        status = cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_gemm error!");
-    }
-}
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle,
-                     cudaStream_t stream) {
-    /* Performs operation using cublas */
-    float beta = 0.0f;
-    cublasSetStream(handle, stream);
-    cublasStatus_t status;
-    if (batch_count <= 1) {
-        // k,m n,k
-        status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m, input_b,
-                              CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    } else {
-        status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m,
-                                            stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m,
-                                            stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
-    }
-    if (status != CUBLAS_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinfer_gemm error!");
-    }
-}
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-        // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl;
-    }
-
-    int lda = k;
-    int ldb = k;
-    int ldc = m;
-    // float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_N;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = k;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_N;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = m;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) {
-    cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST;
-    cuinferOperation_t transa = CUINFER_OP_T;
-    cuinferOperation_t transb = CUINFER_OP_T;
-    cudaDataType_t Atype = CUDA_R_16F;
-    cudaDataType_t Btype = CUDA_R_16F;
-    cudaDataType_t Ctype = CUDA_R_16F;
-    cudaDataType_t computeType = CUDA_R_32F;
-    cudaDataType_t scaleType = CUDA_R_32F;
-    cuinferGEMMCustomOption_t customOption;
-    if (bias != nullptr) {
-        if (act_type == 3) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU;
-
-        } else if (act_type == 4) {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU;
-        } else {
-            customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS;
-        }
-    } else {
-        customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE;
-    }
-
-    int lda = k;
-    int ldb = n;
-    int ldc = m;
-    float beta = 0.f;
-
-    cuinferStatus_t status =
-        cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype,
-                          lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count,
-                          computeType, scaleType, nullptr, (void *)bias, customOption);
-    if (status != CUINFER_STATUS_SUCCESS) {
-        throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !");
-    }
-}
-
-}  // namespace backend
-}  // namespace nvinfer1::ixrt_plugin
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
deleted file mode 100644
index 2433b3a1..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_fp16.h>
-#include <ixinfer.h>
-
-#include <stdexcept>
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace backend {
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                     int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count,
-                     int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream);
-
-void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n,
-                        int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                        cuinferHandle_t cuinfer_handle, cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                  int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                  cudaStream_t stream);
-
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k,
-                     int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle,
-                     cudaStream_t stream);
-
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                  int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta,
-                  const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m,
-                     int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha,
-                     const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle);
-}  // namespace bert
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h
deleted file mode 100644
index a75d902f..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cuda_fp16.h>
-
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <numeric>
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-constexpr uint32_t BDIM = 0;  // batch dimension
-constexpr uint32_t SDIM = 1;  // seq len dimension
-constexpr uint32_t HDIM = 2;  // hidden dimension
-
-#define TRT_UNUSED (void)
-
-template <typename T>
-struct CudaDeleter {
-    void operator()(T* buf) { IXRT_PLUGIN_CUASSERT(cudaFree(buf)); }
-};
-
-template <typename T>
-using cuda_unique_ptr = std::unique_ptr<T, CudaDeleter<T>>;
-
-inline uint32_t getElementSize(nvinfer1::DataType t) noexcept {
-    switch (t) {
-        case nvinfer1::DataType::kINT32:
-            return 4;
-        case nvinfer1::DataType::kFLOAT:
-            return 4;
-        case nvinfer1::DataType::kHALF:
-            return 2;
-        case nvinfer1::DataType::kBOOL:
-        // case nvinfer1::DataType::kUINT8:
-        case nvinfer1::DataType::kINT8:
-            return 1;
-        default:
-            break;
-        // case DataType::kUNKNOWN:
-        // case DataType::kINT64:
-        // case DataType::kFLOAT64:
-            // break;
-    }
-    return 0;
-}
-
-inline int64_t getWeightsSize(nvinfer1::Weights const& w, nvinfer1::DataType type) {
-    return w.count * getElementSize(type);
-}
-
-template <typename T>
-using cuda_shared_ptr = std::shared_ptr<T>;
-
-template <typename T>
-void make_cuda_shared(cuda_shared_ptr<T>& ptr, void* cudaMem) {
-    ptr.reset(static_cast<T*>(cudaMem), bert::CudaDeleter<T>());
-}
-
-struct WeightsWithOwnership : public nvinfer1::Weights {
-    ILogger* logger_;
-    WeightsWithOwnership() {
-        values = nullptr;
-        count = 0;
-    }
-    ~WeightsWithOwnership() { operator delete[](const_cast<void*>(values)); }
-
-    WeightsWithOwnership(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&) = delete;
-    WeightsWithOwnership(WeightsWithOwnership const&&) = delete;
-    WeightsWithOwnership operator=(WeightsWithOwnership const&&) = delete;
-
-    void convertAndCopy(nvinfer1::Weights const& src, nvinfer1::DataType type, float scale = 1) {
-        this->type = type;
-        this->count = src.count;
-
-        if (type == nvinfer1::DataType::kFLOAT) {
-            auto destBuf = new float[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Float Array(Host)" << endl;
-                std::copy_n(static_cast<float const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kHALF);
-
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Float Array(Host)" << endl;
-                auto const s = static_cast<half const*>(src.values);
-                auto d = static_cast<float*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __half2float(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kHALF) {
-            auto destBuf = new half[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kHALF) {
-                ixrt_plugin::gLogInfo << "Half Weights(Host) => Half Array(Host)" << endl;
-                std::copy_n(static_cast<half const*>(src.values), src.count, destBuf);
-            } else {
-                IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kFLOAT);
-
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Half Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<half*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    d[it] = __float2half(s[it]);
-                }
-            }
-        } else if (type == nvinfer1::DataType::kINT8) {
-            auto destBuf = new int8_t[src.count];
-            this->values = destBuf;
-
-            if (src.type == nvinfer1::DataType::kFLOAT) {
-                ixrt_plugin::gLogInfo << "Float Weights(Host) => Int8 Array(Host)" << endl;
-                auto const s = static_cast<float const*>(src.values);
-                auto d = static_cast<int8_t*>(const_cast<void*>(this->values));
-
-                for (auto it = 0; it < src.count; it++) {
-                    int32_t v = static_cast<int32_t>(std::roundf(s[it] / scale));
-                    d[it] = v <= -127 ? -127 : (v >= 127 ? 127 : v);
-                }
-            } else if (src.type == nvinfer1::DataType::kINT8) {
-                ixrt_plugin::gLogInfo << "Int8 Weights(Host) => Int8 Array(Host)" << endl;
-                std::copy_n(static_cast<int8_t const*>(src.values), src.count, destBuf);
-            } else {
-                throw std::runtime_error("Unsupported DataType specified for plugin.");
-            }
-        } else {
-            throw std::runtime_error("Unsupported DataType specified for plugin.");
-        }
-    }
-
-    void convertAndCopy(char const*& srcBuf, size_t count, nvinfer1::DataType type) noexcept {
-        this->type = type;
-        this->count = count;
-        auto const nbBytes = getWeightsSize(*this, type);
-        auto destBuf = new char[nbBytes];
-        this->values = destBuf;
-
-        std::copy_n(srcBuf, nbBytes, destBuf);
-        srcBuf += nbBytes;
-    }
-};
-
-template <typename T>
-inline void copyToDevice(WeightsWithOwnership& hostWeights, size_t nbBytes, cuda_unique_ptr<T>& cudaWeights) {
-    if (hostWeights.values) {
-        void* cudaMem{nullptr};
-        IXRT_PLUGIN_CUASSERT(cudaMalloc(&cudaMem, nbBytes));
-        IXRT_PLUGIN_CUASSERT(cudaMemcpy(cudaMem, hostWeights.values, nbBytes, cudaMemcpyHostToDevice));
-        cudaWeights.reset(static_cast<T*>(cudaMem));
-    }
-}
-
-template <typename T>
-inline void serFromDev(char*& buffer, T const* data, size_t nbElem) {
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(buffer, static_cast<void const*>(data), len, cudaMemcpyDeviceToHost));
-    buffer += len;
-}
-
-template <typename T>
-inline T* deserToDev(char const*& buffer, size_t nbElem) {
-    void* dev{nullptr};
-    const size_t len = sizeof(T) * nbElem;
-    IXRT_PLUGIN_CUASSERT(cudaMalloc(&dev, len));
-    IXRT_PLUGIN_CUASSERT(cudaMemcpy(dev, buffer, len, cudaMemcpyHostToDevice));
-
-    buffer += len;
-    return static_cast<T*>(dev);
-}
-
-inline nvinfer1::DataType fieldTypeToDataType(const nvinfer1::PluginFieldType ftype) {
-    switch (ftype) {
-        case nvinfer1::PluginFieldType::kFLOAT32: {
-            gLogInfo << "PluginFieldType is Float32" << endl;
-            return nvinfer1::DataType::kFLOAT;
-        }
-        case nvinfer1::PluginFieldType::kFLOAT16: {
-            gLogInfo << "PluginFieldType is Float16" << endl;
-            return nvinfer1::DataType::kHALF;
-        }
-        case nvinfer1::PluginFieldType::kINT32: {
-            gLogInfo << "PluginFieldType is Int32" << endl;
-            return nvinfer1::DataType::kINT32;
-        }
-        case nvinfer1::PluginFieldType::kINT8: {
-            gLogInfo << "PluginFieldType is Int8" << endl;
-            return nvinfer1::DataType::kINT8;
-        }
-        default:
-            throw std::invalid_argument("No corresponding datatype for plugin field type");
-    }
-}
-
-inline int64_t volume(nvinfer1::Dims const& d) {
-    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>{});
-}
-}  // namespace bert
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp
deleted file mode 100644
index 8e705d6c..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "checkMacrosPlugin.h"
-
-#include "NvInferRuntimeCommon.h"
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-ILogger* gLogger{};
-
-template <ILogger::Severity kSeverity>
-int32_t LogStream<kSeverity>::Buf::sync() {
-    std::string s = str();
-    while (!s.empty() && s.back() == '\n') {
-        s.pop_back();
-    }
-    if (gLogger != nullptr) {
-        gLogger->log(kSeverity, s.c_str());
-    }
-    str("");
-    return 0;
-}
-
-// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger
-// (otherwise, it will not log)
-LogStream<ILogger::Severity::kERROR> gLogError;
-LogStream<ILogger::Severity::kWARNING> gLogWarning;
-LogStream<ILogger::Severity::kINFO> gLogInfo;
-LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h
deleted file mode 100644
index 76d87a92..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cublasLt.h>
-
-#include <cassert>
-#include <iostream>
-#include <mutex>
-#include <sstream>
-
-#include "NvInfer.h"
-#include "NvInferRuntime.h"
-
-// Logs failed assertion and aborts.
-// Aborting is undesirable and will be phased-out from the plugin module, at which point
-// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE.
-using namespace std;
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-
-#ifdef _MSC_VER
-#define FN_NAME __FUNCTION__
-#else
-#define FN_NAME __func__
-#endif
-
-#define IXRT_PLUGIN_CHECK_VALUE(value, msg)                            \
-    {                                                                  \
-        if (not(value)) {                                              \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"           \
-                      << "-" << __FUNCTION__ << " : "                  \
-                      << " Plugin assert error: " << msg << std::endl; \
-            std::exit(EXIT_FAILURE);                                   \
-        }                                                              \
-    }
-
-#define IXRT_PLUGIN_ASSERT(value)                             \
-    {                                                         \
-        if (not(value)) {                                     \
-            std::cerr << __FILE__ << " (" << __LINE__ << ")"  \
-                      << "-" << __FUNCTION__ << " : "         \
-                      << " Plugin assert false" << std::endl; \
-            std::exit(EXIT_FAILURE);                          \
-        }                                                     \
-    }
-
-#define IXRT_PLUGIN_CHECK_CUDA(call)                                        \
-    do {                                                                    \
-        const cudaError_t error_code = call;                                \
-        if (error_code != cudaSuccess) {                                    \
-            printf("CUDA Error:\n");                                        \
-            printf("    File:       %s\n", __FILE__);                       \
-            printf("    Line:       %d\n", __LINE__);                       \
-            printf("    Error code: %d\n", error_code);                     \
-            printf("    Error text: %s\n", cudaGetErrorString(error_code)); \
-            exit(1);                                                        \
-        }                                                                   \
-    } while (0)
-
-inline void caughtError(const std::exception& e) { std::cerr << e.what() << std::endl; }
-
-#define IXRT_PLUGIN_FAIL(msg)                         \
-    do {                                              \
-        std::ostringstream stream;                    \
-        stream << "Assertion failed: " << msg << "\n" \
-               << __FILE__ << ':' << __LINE__ << "\n" \
-               << "Aborting..."                       \
-               << "\n";                               \
-        IXRT_PLUGIN_CHECK_CUDA(cudaDeviceReset());    \
-        abort;                                        \
-    } while (0)
-
-inline void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) {
-    std::cerr << file << " (" << line << ")"
-              << "-" << function << " : " << msg << std::endl;
-    std::exit(EXIT_FAILURE);
-}
-
-#define IXRT_PLUGIN_CUASSERT(status_)                             \
-    {                                                             \
-        auto s_ = status_;                                        \
-        if (s_ != cudaSuccess) {                                  \
-            const char* msg = cudaGetErrorString(s_);             \
-            throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg); \
-        }                                                         \
-    }
-
-#undef CUINFER_CHECK
-#define CUINFER_CHECK(func)                                                              \
-    do {                                                                                 \
-        cuinferStatus_t status = (func);                                                 \
-        if (status != CUINFER_STATUS_SUCCESS) {                                          \
-            std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \
-                      << cuinferGetErrorString(status) << std::endl;                     \
-            std::exit(EXIT_FAILURE);                                                     \
-        }                                                                                \
-    } while (0)
-
-static std::string _cudaGetErrorString(cublasStatus_t error) {
-    switch (error) {
-        case CUBLAS_STATUS_SUCCESS:
-            return "CUBLAS_STATUS_SUCCESS";
-
-        case CUBLAS_STATUS_NOT_INITIALIZED:
-            return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-        case CUBLAS_STATUS_ALLOC_FAILED:
-            return "CUBLAS_STATUS_ALLOC_FAILED";
-
-        case CUBLAS_STATUS_INVALID_VALUE:
-            return "CUBLAS_STATUS_INVALID_VALUE";
-
-        case CUBLAS_STATUS_ARCH_MISMATCH:
-            return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-        case CUBLAS_STATUS_MAPPING_ERROR:
-            return "CUBLAS_STATUS_MAPPING_ERROR";
-
-        case CUBLAS_STATUS_EXECUTION_FAILED:
-            return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-        case CUBLAS_STATUS_INTERNAL_ERROR:
-            return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-        case CUBLAS_STATUS_NOT_SUPPORTED:
-            return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-        case CUBLAS_STATUS_LICENSE_ERROR:
-            return "CUBLAS_STATUS_LICENSE_ERROR";
-    }
-    return "CUBLAS_UNKNOW";
-}
-
-template <typename T>
-void check_gpu_error(T result, char const* const func, const char* const file, int const line) {
-    if (result) {
-        throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" + std::to_string(line) +
-                                 "): " + (_cudaGetErrorString(result)) + "\n");
-    }
-}
-
-#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__)
-
-template <ILogger::Severity kSeverity>
-class LogStream : public std::ostream {
-    class Buf : public std::stringbuf {
-       public:
-        int32_t sync() override;
-    };
-
-    Buf buffer;
-    std::mutex mLogStreamMutex;
-
-   public:
-    std::mutex& getMutex() { return mLogStreamMutex; }
-    LogStream() : std::ostream(&buffer){};
-};
-
-// Use mutex to protect multi-stream write to buffer
-template <ILogger::Severity kSeverity, typename T>
-LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, T const& msg) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << msg;
-    return stream;
-}
-
-// Special handling static numbers
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, int32_t num) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << num;
-    return stream;
-}
-
-// Special handling std::endl
-template <ILogger::Severity kSeverity>
-inline LogStream<kSeverity>& operator<<(LogStream<kSeverity>& stream, std::ostream& (*f)(std::ostream&)) {
-    std::lock_guard<std::mutex> guard(stream.getMutex());
-    auto& os = static_cast<std::ostream&>(stream);
-    os << f;
-    return stream;
-}
-
-extern LogStream<ILogger::Severity::kERROR> gLogError;
-extern LogStream<ILogger::Severity::kWARNING> gLogWarning;
-extern LogStream<ILogger::Severity::kINFO> gLogInfo;
-extern LogStream<ILogger::Severity::kVERBOSE> gLogVerbose;
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh b/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh
deleted file mode 100644
index b9b9eb8e..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-
-#pragma once
-
-#include <cublasLt.h>
-
-#include <cuda.h>
-namespace nvinfer1 {
-namespace ixrt_plugin {
-#ifdef __ILUVATAR__
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 64;
-static const dim3 kMaxBlockDimension = {4096, 4096, 64};
-static const dim3 kMaxGridDimension = {4294967295, 65536, 65536};
-static const int kNbThreadsPerBlockGainBestPerformance = 1024;
-static const int kMaxSharedMemSizePerBlock = (128 * 1024 * 4);
-static const int kNbSmemLane = 64;
-static const int kNbBytesPerSmemLane = 4;
-#else
-static const int kMaxThreadNbPerBlock = 1024;
-static const int kMaxBlockNbPerSM = 8;
-static const int kWarpSize = 32;
-static const dim3 kMaxBlockDimension = {1024, 1024, 64};
-static const dim3 kMaxGridDimension = {2147483647, 65535, 65535};
-static const int kNbThreadsPerBlockGainBestPerformance = 256;
-static const int kMaxSharedMemSizePerBlock = 48 * 1024 * 4;
-static const int kNbSmemLane = 32;
-static const int kNbBytesPerSmemLane = 4;
-#endif
-
-static const int kNbCe = 4;
-static const int kNbCuPerCe = 4;
-static const int kNbSppPerCu = 4;
-
-static const float kLog2e = 1.442695040888963387;
-
-#define DivUp(x, y) (((x) + (y)-1) / (y))
-
-__device__ __forceinline__ float floatExp(float x) { return __builtin_exp2f(kLog2e * x); }
-
-__device__ __forceinline__ float floatLog(float x) { return __logf(x); }
-
-__forceinline__ int nearest_num(int x, int value) {
-    if (x % value == 0) {
-        return x;
-    } else {
-        int padding = value - x % value;
-        return x + padding;
-    }
-}
-}  // namespace nvinfer1::ixrt_plugin
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp
deleted file mode 100644
index 29908ff1..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "plugin.h"
-#include "checkMacrosPlugin.h"
-
-namespace nvinfer1
-{
-namespace ixrt_plugin
-{
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc)
-{
-    for (int32_t i = 0; i < fc->nbFields; i++)
-    {
-        requiredFieldNames.erase(fc->fields[i].name);
-    }
-    if (!requiredFieldNames.empty())
-    {
-        std::stringstream msg{};
-        msg << "PluginFieldCollection missing required fields: {";
-        char const* separator = "";
-        for (auto const& field : requiredFieldNames)
-        {
-            msg << separator << field;
-            separator = ", ";
-        }
-        msg << "}";
-        std::string msg_str = msg.str();
-        IXRT_PLUGIN_CHECK_VALUE(false, msg_str.c_str());
-    }
-}
-
-} // namespace ixrt_plugin
-} // namespace nvinfer1
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h
deleted file mode 100644
index b24ef300..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h
+++ /dev/null
@@ -1,72 +0,0 @@
-
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-#include <cstring>
-#include <string>
-#include <set>
-#include "NvInferRuntimeCommon.h"
-
-typedef enum
-{
-    STATUS_SUCCESS = 0,
-    STATUS_FAILURE = 1,
-    STATUS_BAD_PARAM = 2,
-    STATUS_NOT_SUPPORTED = 3,
-    STATUS_NOT_INITIALIZED = 4
-} pluginStatus_t;
-
-namespace nvinfer1 {
-
-namespace ixrt_plugin {
-
-
-// Write values into buffer
-template <typename T>
-void write(char*& buffer, const T& val) {
-    std::memcpy(buffer, &val, sizeof(T));
-    buffer += sizeof(T);
-}
-
-// Read values from buffer
-template <typename T>
-T read(const char*& buffer) {
-    T val{};
-    std::memcpy(&val, buffer, sizeof(T));
-    buffer += sizeof(T);
-    return val;
-}
-
-void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc);
-
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h
deleted file mode 100644
index 11ef7eca..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#pragma once
-
-#include <cstring>
-#include <vector>
-#include <cassert>
-#include <type_traits>
-
-#include <iostream>
-using std::cerr;
-using std::cout;
-using std::endl;
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value);
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value);
-
-namespace
-{
-
-template <typename T, class Enable = void>
-struct Serializer
-{
-};
-
-template <typename T>
-struct Serializer<T,
-    typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(T const&)
-    {
-        return sizeof(T);
-    }
-    static void serialize(void** buffer, T const& value)
-    {
-        ::memcpy(*buffer, &value, sizeof(T));
-        reinterpret_cast<char*&>(*buffer) += sizeof(T);
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, T* value)
-    {
-        assert(*buffer_size >= sizeof(T));
-        ::memcpy(value, *buffer, sizeof(T));
-        reinterpret_cast<char const*&>(*buffer) += sizeof(T);
-        *buffer_size -= sizeof(T);
-    }
-};
-
-template <>
-struct Serializer<const char*>
-{
-    static size_t serialized_size(const char* value)
-    {
-        return strlen(value) + 1;
-    }
-    static void serialize(void** buffer, const char* value)
-    {
-        ::strcpy(static_cast<char*>(*buffer), value);
-        reinterpret_cast<char*&>(*buffer) += strlen(value) + 1;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, const char** value)
-    {
-        *value = static_cast<char const*>(*buffer);
-        size_t data_size = strnlen(*value, *buffer_size) + 1;
-        assert(*buffer_size >= data_size);
-        reinterpret_cast<char const*&>(*buffer) += data_size;
-        *buffer_size -= data_size;
-    }
-};
-
-template <typename T>
-struct Serializer<std::vector<T>,
-    typename std::enable_if<std::is_arithmetic<T>::value || std::is_enum<T>::value || std::is_pod<T>::value>::type>
-{
-    static size_t serialized_size(std::vector<T> const& value)
-    {
-        return sizeof(value.size()) + value.size() * sizeof(T);
-    }
-    static void serialize(void** buffer, std::vector<T> const& value)
-    {
-        serialize_value(buffer, value.size());
-        size_t nbyte = value.size() * sizeof(T);
-        ::memcpy(*buffer, value.data(), nbyte);
-        reinterpret_cast<char*&>(*buffer) += nbyte;
-    }
-    static void deserialize(void const** buffer, size_t* buffer_size, std::vector<T>* value)
-    {
-        size_t size;
-        deserialize_value(buffer, buffer_size, &size);
-        value->resize(size);
-        size_t nbyte = value->size() * sizeof(T);
-        assert(*buffer_size >= nbyte);
-        ::memcpy(value->data(), *buffer, nbyte);
-        reinterpret_cast<char const*&>(*buffer) += nbyte;
-        *buffer_size -= nbyte;
-    }
-};
-
-} // namespace
-
-template <typename T>
-inline size_t serialized_size(T const& value)
-{
-    return Serializer<T>::serialized_size(value);
-}
-
-template <typename T>
-inline void serialize_value(void** buffer, T const& value)
-{
-    return Serializer<T>::serialize(buffer, value);
-}
-
-template <typename T>
-inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value)
-{
-    return Serializer<T>::deserialize(buffer, buffer_size, value);
-}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp
deleted file mode 100644
index cf00d620..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp
+++ /dev/null
@@ -1,431 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-#include "NvInferRuntimeCommon.h"
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "cuda_runtime_api.h"
-#include "driver_types.h"
-#include "fcPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-#include <cassert>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace {
-char const* const kFC_VERSION{"2"};
-char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection FCInt8PluginDynamicCreator::mFC{};
-std::vector<PluginField> FCInt8PluginDynamicCreator::mPluginAttributes;
-
-FCInt8PluginDynamicCreator::FCInt8PluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kINT8, 1));
-    mPluginAttributes.emplace_back(PluginField("fc_amax", nullptr, PluginFieldType::kFLOAT32, 2));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* FCInt8PluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; }
-
-char const* FCInt8PluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-PluginFieldCollection const* FCInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* FCInt8PluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FCInt8PluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        Weights W{DataType::kINT8, nullptr, 0LL};
-        Weights Bias{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "W", "fc_amax"}, fc);
-        vector<float> weight_scale;
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("W") == 0) {
-                gLogInfo << "Building W..." << endl;
-                W.values = fc->fields[i].data;
-                W.count = fc->fields[i].length;
-                W.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W int8: " << (W.type == DataType::kINT8) << endl;
-            }
-
-            if (fieldName.compare("Bias") == 0) {
-                gLogInfo << "Building Bias..." << endl;
-                Bias.values = fc->fields[i].data;
-                Bias.count = fc->fields[i].length;
-                Bias.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is Bias float32: " << (Bias.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("fc_amax") == 0) {
-                gLogInfo << "Building fc_amax..." << endl;
-                for (auto j = 0; j < fc->fields[i].length; j++) {
-                    auto value = static_cast<float const*>(fc->fields[i].data)[j];
-                    weight_scale.emplace_back(value / 127.0);
-                }
-            }
-        }
-
-        if (outDims <= 0) {
-            gLogInfo << "Invalid output dimension" << endl;
-        }
-        if (W.count == 0 || W.values == nullptr || W.count < outDims) {
-            gLogInfo << "Invalid weights" << endl;
-        }
-
-        DataType type = DataType::kINT8;
-        return new FCInt8PluginDynamic(name, type, outDims, W, Bias, weight_scale);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* FCInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                         size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call FCInt8PluginDynamic::destroy()
-    try {
-        return new FCInt8PluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void FCInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(FCInt8PluginDynamicCreator);
-//#########################################################################//
-FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, DataType const type, int32_t const outDim,
-                                         Weights const& W, Weights const& Bias, vector<float> const& scale)
-    : mLayerName(name),
-      mType(type),
-      mOutDim(outDim),
-      mNumParams(W.count),
-      mNmax(0),
-      mK(0),
-      mWdev(nullptr),
-      mNumBias(Bias.count),
-      mScale(scale),
-      mBiasdev(nullptr) {
-    if (W.type == nvinfer1::DataType::kFLOAT) {
-        float weight_max = std::numeric_limits<float>::min();
-        for (int64_t wb = 0, we = W.count; wb < we; ++wb) {
-            float val = static_cast<const float*>(W.values)[wb];
-            weight_max = std::max(weight_max, std::abs(val));
-        }
-        // mWeightScale = 127 / weight_max;
-    }
-
-    mW.convertAndCopy(W, DataType::kINT8, scale[0]);
-    copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev);
-    if (Bias.values != nullptr) {
-        mBias.convertAndCopy(Bias, DataType::kFLOAT);
-        copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev);
-    }
-}
-
-FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, void const* data, size_t length)
-    : mLayerName(name), mWdev(nullptr), mBiasdev(nullptr) {
-    gLogInfo << "FCInt8PluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mOutDim);
-    deserialize_value(&data, &length, &mNumParams);
-    deserialize_value(&data, &length, &mNmax);
-    deserialize_value(&data, &length, &mK);
-    deserialize_value(&data, &length, &mNumBias);
-    deserialize_value(&data, &length, &mScale);
-
-    char const* d = static_cast<char const*>(data);
-
-    mW.convertAndCopy(d, mNumParams, DataType::kINT8);
-    copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev);
-    if (mNumBias > 0) {
-        mBias.convertAndCopy(d, mNumBias, DataType::kFLOAT);
-        copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev);
-    }
-}
-
-// IPluginV2 Methods
-char const* FCInt8PluginDynamic::getPluginType() const noexcept { return kFC_NAME; }
-
-char const* FCInt8PluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-int32_t FCInt8PluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t FCInt8PluginDynamic::initialize() noexcept {
-    gLogInfo << "FCInt8PluginDynamic initialize" << endl;
-    return 0;
-}
-
-void FCInt8PluginDynamic::terminate() noexcept { gLogInfo << "FCInt8PluginDynamic terminate" << endl; }
-
-size_t FCInt8PluginDynamic::getSerializationSize() const noexcept {
-    return sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNmax) + sizeof(mK) + sizeof(mNumBias) +
-           mScale.size() * sizeof(float) + sizeof(mScale.size()) + getElementSize(DataType::kINT8) * mNumParams +
-           getElementSize(DataType::kFLOAT) * mNumBias;
-}
-
-void FCInt8PluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mOutDim);
-    serialize_value(&buffer, mNumParams);
-    serialize_value(&buffer, mNmax);
-    serialize_value(&buffer, mK);
-    serialize_value(&buffer, mNumBias);
-    serialize_value(&buffer, mScale);
-
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, static_cast<char*>(mWdev.get()), mNumParams * getElementSize(DataType::kINT8));
-
-    if (mNumBias > 0) {
-        serFromDev(d, static_cast<char*>(mBiasdev.get()), mNumBias * getElementSize(DataType::kFLOAT));
-    }
-}
-
-void FCInt8PluginDynamic::destroy() noexcept {
-    gLogInfo << "FCInt8PluginDynamic destroy" << endl;
-    mWdev.reset(nullptr);
-    if (mNumBias > 0) {
-        mBiasdev.reset(nullptr);
-    }
-    delete this;
-}
-
-void FCInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType FCInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-    // IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kINT8);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* FCInt8PluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "FCInt8PluginDynamic clone" << endl;
-
-        auto* p = new FCInt8PluginDynamic(mLayerName, mType, mOutDim, mW, mBias, mScale);
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs FCInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                                   IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        DimsExprs ret;
-        ret.nbDims = 5;
-        ret.d[0] = inputs[0].d[0];
-        ret.d[1] = inputs[0].d[1];
-        ret.d[2] = exprBuilder.constant(mOutDim);
-        ret.d[3] = exprBuilder.constant(1);
-        ret.d[4] = exprBuilder.constant(1);
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool FCInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                    int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    IXRT_PLUGIN_ASSERT(inOut != nullptr);
-
-    PluginTensorDesc const& in = inOut[pos];
-    if (pos == 0) {
-        return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-    }
-    PluginTensorDesc const& prev = inOut[pos - 1];
-
-    // output
-    return in.type == prev.type && in.format == prev.format;
-}
-
-void FCInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                          DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-        auto const& inDims0 = inputs[0].desc.dims;
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        mK = inDims0.d[HDIM];  // hiddensize
-        // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-        CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t FCInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                             PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    int32_t const B = inputs[0].dims.d[BDIM];
-    int32_t const S = inputs[0].dims.d[SDIM];
-    int32_t const oE = outputs[0].dims.d[HDIM];
-#ifdef __ILUVATAR__
-        return B * S * oE * sizeof(int8_t);
-#else 
-        return B * S * oE * sizeof(int32_t);
-#endif
-}
-
-int32_t FCInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                     void const* const* inputs, void* const* outputs, void* workSpace,
-                                     cudaStream_t stream) noexcept {
-    try {
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const E = inputDesc->dims.d[HDIM];
-        int32_t const oE = outputDesc->dims.d[HDIM];
-        int32_t const n = S * B;
-        IXRT_PLUGIN_ASSERT(n >= 0);
-
-        float qkv_in_scale = inputDesc[0].scale;
-        float qkv_wei_scale = mScale[0];
-        float output_scale = outputDesc[0].scale;
-        float qkv_out_scale;
-        if (mScale.size() == 2) {
-            qkv_out_scale = mScale[1];
-        } else {
-            qkv_out_scale = output_scale;
-        }
-#ifdef __ILUVATAR__
-        int8_t* buffer = static_cast<int8_t*>(workSpace);
-#else
-        int32_t* buffer = static_cast<int32_t*>(workSpace);
-#endif
-        if (mType == DataType::kINT8) {
-            auto const* const input = static_cast<int8_t const*>(inputs[0]);
-            auto* output = static_cast<int8_t*>(outputs[0]);
-            auto weight = static_cast<int8_t*>(mWdev.get());
-
-            float dequant_scale = (qkv_in_scale * qkv_wei_scale) / qkv_out_scale;
-
-            if (mBiasdev.get() != nullptr) {
-#ifdef __ILUVATAR__
-                cuinfer_i8_gemm(weight, input, nullptr, buffer, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0,
-                                cuinfer_handle, stream);
-                dequantGemmWithBias(buffer, static_cast<float*>(mBiasdev.get()), output, B * S, oE, qkv_out_scale,
-                                    1.0 / output_scale, stream);
-#else
-                cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream);
-                dequantGemmWithBias(buffer, static_cast<float*>(mBiasdev.get()), output, B * S, oE,  dequant_scale, qkv_out_scale,
-                                    1.0 / output_scale, stream);
-#endif
-                
-            } else {
-#ifdef __ILUVATAR__
-                cuinfer_i8_gemm(weight, input, nullptr, output, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0,
-                                cuinfer_handle, stream);
-#else
-                
-                cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream);
-                quantGemm(buffer, output, B * S, oE, dequant_scale, stream);
-#endif
-            }
-        } else {
-            gLogError << "Unsupported type error, expected [kINT8], but received " << static_cast<int32_t>(mType)
-                      << endl;
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu
deleted file mode 100644
index 7e233c87..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu
+++ /dev/null
@@ -1,485 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "backend/bert/bert_helper.h"
-#include "fcPlugin.h"
-using namespace nvinfer1::ixrt_plugin::backend;
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_without_bias(const int8_t* input, int8_t* output, int hidden_size, float dequant_scale,
-                                          float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    char4* p_input = (char4*)input;
-    char4* p_output = (char4*)output;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-
-        val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale;
-        val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale;
-        val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale;
-        val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale;
-
-        char4 res = float42char4(val[it], quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_with_bias(const int8_t* input, const float* bias, int8_t* output, int hidden_size,
-                                       float dequant_scale, float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    char4* p_input = (char4*)input;
-    float4* p_bias = (float4*)bias;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        bias_val.x = p_bias[element_index].x;
-        bias_val.y = p_bias[element_index].y;
-        bias_val.z = p_bias[element_index].z;
-        bias_val.w = p_bias[element_index].w;
-
-        val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale + bias_val.x;
-        val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale + bias_val.y;
-        val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale + bias_val.z;
-        val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale + bias_val.w;
-
-        char4 res = float42char4(val[it], quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void dequant_gemm_with_bias(const int32_t* input, const float* bias, int8_t* output, int hidden_size,
-                                       float quant_scale1, float dequant_scale, float quant_scale2, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    int4* p_input = (int4*)input;
-    float4* p_bias = (float4*)bias;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        bias_val.x = p_bias[element_index].x;
-        bias_val.y = p_bias[element_index].y;
-        bias_val.z = p_bias[element_index].z;
-        bias_val.w = p_bias[element_index].w;
-
-        char4 q_input;
-        q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale1);
-        q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale1);
-        q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale1);
-        q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale1);
-
-        val[it].x = __int2float_rn(q_input.x) * dequant_scale + bias_val.x;
-        val[it].y = __int2float_rn(q_input.y) * dequant_scale + bias_val.y;
-        val[it].z = __int2float_rn(q_input.z) * dequant_scale + bias_val.z;
-        val[it].w = __int2float_rn(q_input.w) * dequant_scale + bias_val.w;
-
-        char4 res = float42char4(val[it], quant_scale2);
-        p_output[element_index] = res;
-    }
-}
-
-void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale,
-                            float quant_scale, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size / 4 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    int num_per_tca = 64;
-    dim3 gridSize(batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-
-    switch (num_warp) {
-        case 1:
-            dequant_gemm_without_bias<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 2:
-            dequant_gemm_without_bias<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 3:
-            dequant_gemm_without_bias<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 4:
-            dequant_gemm_without_bias<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 5:
-            dequant_gemm_without_bias<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 6:
-            dequant_gemm_without_bias<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 7:
-            dequant_gemm_without_bias<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 8:
-            dequant_gemm_without_bias<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 9:
-            dequant_gemm_without_bias<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 10:
-            dequant_gemm_without_bias<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 11:
-            dequant_gemm_without_bias<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 12:
-            dequant_gemm_without_bias<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 13:
-            dequant_gemm_without_bias<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 14:
-            dequant_gemm_without_bias<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 15:
-            dequant_gemm_without_bias<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 16:
-            dequant_gemm_without_bias<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("dequantGemmWithoutBias");
-            break;
-    }
-}
-
-void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size,
-                         float dequant_scale, float quant_scale, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size / 4 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    int num_per_tca = 64;
-    dim3 gridSize(batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-
-    switch (num_warp) {
-        case 1:
-            dequant_gemm_with_bias<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 2:
-            dequant_gemm_with_bias<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 3:
-            dequant_gemm_with_bias<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 4:
-            dequant_gemm_with_bias<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 5:
-            dequant_gemm_with_bias<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 6:
-            dequant_gemm_with_bias<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 7:
-            dequant_gemm_with_bias<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 8:
-            dequant_gemm_with_bias<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 9:
-            dequant_gemm_with_bias<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 10:
-            dequant_gemm_with_bias<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 11:
-            dequant_gemm_with_bias<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 12:
-            dequant_gemm_with_bias<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 13:
-            dequant_gemm_with_bias<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 14:
-            dequant_gemm_with_bias<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 15:
-            dequant_gemm_with_bias<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        case 16:
-            dequant_gemm_with_bias<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("dequantGemmWithBias with int8_t input");
-            break;
-    }
-}
-
-void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size,
-                         float quant_scale1, float dequant_scale, float quant_scale2, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size / 4 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    int num_per_tca = 64;
-    dim3 gridSize(batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-
-    switch (num_warp) {
-        case 1:
-            dequant_gemm_with_bias<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 2:
-            dequant_gemm_with_bias<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 3:
-            dequant_gemm_with_bias<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 4:
-            dequant_gemm_with_bias<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 5:
-            dequant_gemm_with_bias<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 6:
-            dequant_gemm_with_bias<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 7:
-            dequant_gemm_with_bias<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 8:
-            dequant_gemm_with_bias<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 9:
-            dequant_gemm_with_bias<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 10:
-            dequant_gemm_with_bias<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 11:
-            dequant_gemm_with_bias<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 12:
-            dequant_gemm_with_bias<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 13:
-            dequant_gemm_with_bias<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 14:
-            dequant_gemm_with_bias<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 15:
-            dequant_gemm_with_bias<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        case 16:
-            dequant_gemm_with_bias<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("dequantGemmWithBias with int32_t input");
-            break;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void quant_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_start = blockIdx.x * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    int4* p_input = (int4*)input;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        char4 q_input;
-        q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale);
-        q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale);
-        q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale);
-        q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale);
-
-        p_output[element_index] = q_input;
-    }
-}
-
-void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size / 4 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    int num_per_tca = 64;
-    dim3 gridSize(batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-
-    switch (num_warp) {
-        case 1:
-            quant_gemm<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 2:
-            quant_gemm<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 3:
-            quant_gemm<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 4:
-            quant_gemm<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 5:
-            quant_gemm<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 6:
-            quant_gemm<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 7:
-            quant_gemm<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 8:
-            quant_gemm<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 9:
-            quant_gemm<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 10:
-            quant_gemm<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 11:
-            quant_gemm<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 12:
-            quant_gemm<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 13:
-            quant_gemm<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 14:
-            quant_gemm<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 15:
-            quant_gemm<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 16:
-            quant_gemm<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("quantGemm");
-            break;
-    }
-}
-
-}  // namespace bert
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp
deleted file mode 100644
index 67541535..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-#include "fcPlugin.h"
-
-#include "NvInferRuntimeCommon.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#endif
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace {
-char const* const kFC_VERSION{"1"};
-char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection FCPluginDynamicCreator::mFC{};
-std::vector<PluginField> FCPluginDynamicCreator::mPluginAttributes;
-
-FCPluginDynamicCreator::FCPluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kFLOAT32, 1));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* FCPluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; }
-
-char const* FCPluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-PluginFieldCollection const* FCPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* FCPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FCPluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        int32_t typeId = -1;
-        Weights W{DataType::kFLOAT, nullptr, 0LL};
-        Weights B{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("type_id") == 0) {
-                typeId = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-
-            if (fieldName.compare("W") == 0) {
-                gLogInfo << "Building W..." << endl;
-                W.values = fc->fields[i].data;
-                W.count = fc->fields[i].length;
-                W.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W float32: " << (W.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("B") == 0) {
-                gLogInfo << "Building B..." << endl;
-                B.values = fc->fields[i].data;
-                B.count = fc->fields[i].length;
-                B.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is B float32: " << (B.type == DataType::kFLOAT) << endl;
-            }
-        }
-
-        if (outDims <= 0) {
-            gLogInfo << "Invalid output dimension" << endl;
-        }
-        if (typeId < 0 || typeId > 1) {
-            gLogInfo << "Invalid type id" << typeId << endl;
-        }
-        if (W.count == 0 || W.values == nullptr || W.count < outDims) {
-            gLogInfo << "Invalid weights" << endl;
-        }
-
-        DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF;
-        return new FCPluginDynamic(name, type, outDims, W, B);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* FCPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                     size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call FCPluginDynamic::destroy()
-    try {
-        return new FCPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void FCPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(FCPluginDynamicCreator);
-//#########################################################################//
-FCPluginDynamic::FCPluginDynamic(std::string const name, DataType const type, int32_t const outDim, Weights const& W,
-                                 Weights const& B)
-    : mLayerName(name),
-      mType(type),
-      mOutDim(outDim),
-      mNumParams(W.count),
-      mNumBias(B.count),
-      mWdev(nullptr),
-      mBdev(nullptr) {
-    mW.convertAndCopy(W, mType);
-    copyToDevice(mW, getWeightsSize(mW, mType), mWdev);
-    if (mNumBias) {
-        mB.convertAndCopy(B, mType);
-        copyToDevice(mB, getWeightsSize(mB, mType), mBdev);
-    }
-}
-
-FCPluginDynamic::FCPluginDynamic(std::string const name, void const* data, size_t length)
-    : mLayerName(name), mWdev(nullptr) {
-    gLogInfo << "FCPluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mOutDim);
-    deserialize_value(&data, &length, &mNumParams);
-    deserialize_value(&data, &length, &mNumBias);
-
-    char const* d = static_cast<char const*>(data);
-
-    mW.convertAndCopy(d, mNumParams, mType);
-    copyToDevice(mW, getWeightsSize(mW, mType), mWdev);
-    if (mNumBias) {
-        mB.convertAndCopy(d, mNumBias, mType);
-        copyToDevice(mB, getWeightsSize(mB, mType), mBdev);
-    }
-}
-
-// IPluginV2 Methods
-char const* FCPluginDynamic::getPluginType() const noexcept { return kFC_NAME; }
-
-char const* FCPluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; }
-
-int32_t FCPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t FCPluginDynamic::initialize() noexcept {
-    gLogInfo << "FCPluginDynamic initialize" << endl;
-    return 0;
-}
-
-void FCPluginDynamic::terminate() noexcept { gLogInfo << "FCPluginDynamic terminate" << endl; }
-
-size_t FCPluginDynamic::getSerializationSize() const noexcept {
-    size_t wordSize = getElementSize(mType);
-    return wordSize * (mNumParams + mNumBias) + sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNumBias);
-}
-
-void FCPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mOutDim);
-    serialize_value(&buffer, mNumParams);
-    serialize_value(&buffer, mNumBias);
-
-    size_t wordSize = getElementSize(mType);
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, static_cast<char*>(mWdev.get()), mNumParams * wordSize);
-    if (mNumBias) {
-        serFromDev(d, static_cast<char*>(mBdev.get()), mNumBias * wordSize);
-    }
-}
-
-void FCPluginDynamic::destroy() noexcept {
-    gLogInfo << "FCPluginDynamic destroy" << endl;
-    mWdev.reset(nullptr);
-    if (mNumBias) {
-        mBdev.reset(nullptr);
-    }
-    delete this;
-}
-
-void FCPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FCPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType FCPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                            int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-    IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* FCPluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "FCPluginDynamic clone" << endl;
-
-        auto* p = new FCPluginDynamic(mLayerName, mType, mOutDim, mW, mB);
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs FCPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                               IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        DimsExprs ret;
-        ret.nbDims = 5;
-        ret.d[0] = inputs[0].d[0];
-        ret.d[1] = inputs[0].d[1];
-        ret.d[2] = exprBuilder.constant(mOutDim);
-        ret.d[3] = exprBuilder.constant(1);
-        ret.d[4] = exprBuilder.constant(1);
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool FCPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    IXRT_PLUGIN_ASSERT(inOut != nullptr);
-
-    PluginTensorDesc const& in = inOut[pos];
-    if (pos == 0) {
-        return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-    }
-    PluginTensorDesc const& prev = inOut[pos - 1];
-
-    // output
-    return in.type == prev.type && in.format == prev.format;
-}
-
-void FCPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                      DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-        auto const& inDims0 = inputs[0].desc.dims;
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-        CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t FCPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                         PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-int32_t FCPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                 void const* const* inputs, void* const* outputs, void* workSpace,
-                                 cudaStream_t stream) noexcept {
-    gLogInfo << "in FCPluginDynamic.." << endl;
-    try {
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const E = inputDesc->dims.d[HDIM];
-        int32_t const n = S * B;
-        IXRT_PLUGIN_ASSERT(n >= 0);
-
-        if (mType == DataType::kHALF) {
-            auto const* const input = static_cast<half const*>(inputs[0]);
-            auto* output = static_cast<half*>(outputs[0]);
-            auto weight = static_cast<half*>(mWdev.get());
-            half* bias = nullptr;
-            if (mNumBias) {
-                bias = static_cast<half*>(mBdev.get());
-            }
-
-#ifdef __ILUVATAR__
-            cuinfer_gemm(weight, input, bias, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, -1, stream, cuinfer_handle);
-#else
-            cublaslt_gemm(weight, input, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, blaslt_handle, stream);
-#endif
-        } else {
-            gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " << static_cast<int32_t>(mType)
-                      << endl;
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h
deleted file mode 100644
index 2f9115dc..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h
+++ /dev/null
@@ -1,246 +0,0 @@
-
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <memory>
-#include <vector>
-
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#include "bertCommon.h"
-#include "driver_types.h"
-
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#else
-#include "backend/cublas/cublas_helper.h"
-#endif
-
-namespace nvinfer1 {
-namespace ixrt_plugin {
-namespace bert {
-
-void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale,
-               cudaStream_t stream);
-
-void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size,
-                         float dequant_scale1, float dequant_scale2, float quant_scale, cudaStream_t stream);
-
-void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size,
-                         float dequant_scale, float quant_scale, cudaStream_t stream);
-
-void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale,
-                            float quant_scale, cudaStream_t stream);
-
-class FCPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    FCPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim,
-                    nvinfer1::Weights const& W, nvinfer1::Weights const& B);
-
-    FCPluginDynamic(std::string const name, void const* data, size_t length);
-
-    // It doesn't make sense to make FCPluginDynamic without arguments, so we
-    // delete default constructor.
-    FCPluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    size_t mOutDim;  // leading dim
-    size_t mNumParams;
-    size_t mNumBias;
-
-    bert::WeightsWithOwnership mW;
-    bert::cuda_unique_ptr<void> mWdev;
-    bert::WeightsWithOwnership mB;
-    bert::cuda_unique_ptr<void> mBdev;
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
-
-class FCPluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    FCPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-class FCInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    FCInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim,
-                        nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector<float> const& scale);
-
-    FCInt8PluginDynamic(std::string const name, void const* data, size_t length);
-
-    // It doesn't make sense to make FCInt8PluginDynamic without arguments, so we
-    // delete default constructor.
-    FCInt8PluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    size_t mOutDim;  // leading dim
-    size_t mNumParams;
-    int32_t mNmax;
-    int32_t mK;
-    int32_t mNumBias;
-
-    vector<float> mScale;
-
-    bert::WeightsWithOwnership mW;
-    bert::cuda_unique_ptr<void> mWdev;
-
-    bert::WeightsWithOwnership mBias;
-    bert::cuda_unique_ptr<void> mBiasdev;
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
-
-class FCInt8PluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    FCInt8PluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-}  // namespace bert
-}  // namespace ixrt_plugin
-}  // namespace nvinfer1
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp
deleted file mode 100644
index 292e8a63..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp
+++ /dev/null
@@ -1,503 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "NvInferImpl.h"
-#include "NvInferRuntimeCommon.h"
-#include "checkMacrosPlugin.h"
-#include "common_def.cuh"
-#include "driver_types.h"
-#include "embLayerNormInt8Plugin.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* EMB_LAYER_NORM_INT8_VERSION{"2"};
-char const* EMB_LAYER_NORM_INT8_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection EmbLayerNormInt8PluginDynamicCreator::mFC{};
-std::vector<PluginField> EmbLayerNormInt8PluginDynamicCreator::mPluginAttributes;
-
-EmbLayerNormInt8PluginDynamicCreator::EmbLayerNormInt8PluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("output_fp16"));
-    mPluginAttributes.emplace_back(PluginField("full_mask"));
-    mPluginAttributes.emplace_back(PluginField("mha_type_id"));
-    mPluginAttributes.emplace_back(PluginField("pad_id"));
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* EmbLayerNormInt8PluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_INT8_NAME; }
-
-char const* EmbLayerNormInt8PluginDynamicCreator::getPluginVersion() const noexcept {
-    return EMB_LAYER_NORM_INT8_VERSION;
-}
-
-PluginFieldCollection const* EmbLayerNormInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::createPlugin(char const* name,
-                                                                        PluginFieldCollection const* fc) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-        gLogInfo << "EmbLayerNormInt8PluginDynamic createPlugin." << endl;
-        std::set<std::string> const requiredAttributes{
-            "bert_embeddings_layernorm_beta",      "bert_embeddings_layernorm_gamma",
-            "bert_embeddings_word_embeddings",     "bert_embeddings_token_type_embeddings",
-            "bert_embeddings_position_embeddings",
-        };
-
-        bool output_fp16 = false;
-        bool useFullMask = false;
-        Weights beta{};
-        Weights gamma{};
-        Weights word_emb{};
-        Weights pos_emb{};
-        Weights tok_emb{};
-        int32_t mhaTypeId = 0;
-        int32_t pad_id = 0;
-
-        for (auto i = 0; i < fc->nbFields; i++) {
-            std::string field_name(fc->fields[i].name);
-            if (field_name.compare("bert_embeddings_layernorm_beta") == 0) {
-                gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl;
-                beta.values = fc->fields[i].data;
-                beta.count = fc->fields[i].length;
-                beta.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) {
-                gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl;
-                gamma.values = fc->fields[i].data;
-                gamma.count = fc->fields[i].length;
-                gamma.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_word_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_word_embeddings..." << endl;
-                word_emb.values = fc->fields[i].data;
-                word_emb.count = fc->fields[i].length;
-                word_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl;
-                tok_emb.values = fc->fields[i].data;
-                tok_emb.count = fc->fields[i].length;
-                tok_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_position_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_position_embeddings..." << endl;
-                pos_emb.values = fc->fields[i].data;
-                pos_emb.count = fc->fields[i].length;
-                pos_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("output_fp16") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building output_fp16: " << output_fp16 << endl;
-            }
-
-            if (field_name.compare("full_mask") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                useFullMask = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building full_mask: " << useFullMask << endl;
-            }
-
-            if (field_name.compare("mha_type_id") == 0) {
-                mhaTypeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3);
-                gLogInfo << "Building mha typeId: " << mhaTypeId << endl;
-            }
-
-            if (field_name.compare("pad_id") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32)
-                pad_id = *static_cast<int32_t const*>(fc->fields[i].data);
-            }
-        }
-        gLogInfo << "Building EmbLayerNormInt8PluginDynamic Plugin..." << endl;
-        DataType mhaType = static_cast<DataType>(mhaTypeId);
-        EmbLayerNormInt8PluginDynamic* p =
-            new EmbLayerNormInt8PluginDynamic(name, output_fp16 ? DataType::kHALF : DataType::kFLOAT, mhaType, beta,
-                                              gamma, word_emb, pos_emb, tok_emb, useFullMask, pad_id);
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                             size_t serialLength) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(serialData != nullptr);
-        return new EmbLayerNormInt8PluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void EmbLayerNormInt8PluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr);
-        mNamespace = pluginNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* EmbLayerNormInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(EmbLayerNormInt8PluginDynamicCreator);
-
-//#########################################################################//
-EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, DataType const type,
-                                                             DataType const mhaType, Weights const& beta,
-                                                             Weights const& gamma, Weights const& wordEmb,
-                                                             Weights const& posEmb, Weights const& tokEmb,
-                                                             bool const useFullMask, int32_t padId)
-    : mLayerName(name),
-      mHiddenSize(beta.count),
-      mEmbType(type),
-      mUseFullMask(useFullMask),
-      mMhaType(mhaType),
-      mPadId(padId) {
-    IXRT_PLUGIN_ASSERT(beta.count == gamma.count);
-    IXRT_PLUGIN_ASSERT(mHiddenSize > 0U);
-    IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0);
-    IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0);
-    IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0);
-    mWordVocabSize = wordEmb.count / mHiddenSize;
-    mPosVocabSize = posEmb.count / mHiddenSize;
-    mTokVocabSize = tokEmb.count / mHiddenSize;
-
-    mBeta.convertAndCopy(beta, nvinfer1::DataType::kFLOAT);
-    mGamma.convertAndCopy(gamma, nvinfer1::DataType::kFLOAT);
-    mWordEmb.convertAndCopy(wordEmb, mEmbType);
-    mTokEmb.convertAndCopy(tokEmb, mEmbType);
-    mPosEmb.convertAndCopy(posEmb, mEmbType);
-
-    copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev);
-    copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev);
-    copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev);
-    copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev);
-    copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev);
-}
-
-EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length)
-    : mLayerName(name),
-      mGammaDev(nullptr),
-      mBetaDev(nullptr),
-      mWordEmbDev(nullptr),
-      mTokEmbDev(nullptr),
-      mPosEmbDev(nullptr) {
-    gLogInfo << "EmbLayerNormInt8PluginDynamic deserialize." << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mEmbType);
-    deserialize_value(&data, &length, &mMhaType);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mSeqLen);
-    deserialize_value(&data, &length, &mPadId);
-    deserialize_value(&data, &length, &mWordVocabSize);
-    deserialize_value(&data, &length, &mPosVocabSize);
-    deserialize_value(&data, &length, &mTokVocabSize);
-    deserialize_value(&data, &length, &mUseFullMask);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT);
-    mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT);
-    mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType);
-    mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType);
-    mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType);
-
-    copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev);
-    copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev);
-    copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev);
-    copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev);
-    copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev);
-}
-
-// IPluginV2 Methods
-char const* EmbLayerNormInt8PluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_INT8_NAME; }
-
-char const* EmbLayerNormInt8PluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_INT8_VERSION; }
-
-int32_t EmbLayerNormInt8PluginDynamic::getNbOutputs() const noexcept { return 3; }
-
-int32_t EmbLayerNormInt8PluginDynamic::initialize() noexcept { return 0; }
-
-void EmbLayerNormInt8PluginDynamic::terminate() noexcept {
-    gLogInfo << "EmbLayerNormInt8PluginDynamic terminate." << endl;
-}
-
-size_t EmbLayerNormInt8PluginDynamic::getSerializationSize() const noexcept {
-    size_t const wordSize = getElementSize(mEmbType);
-    return sizeof(mEmbType) * 2                       // mEmbType, mMhaType
-           + sizeof(mHiddenSize) * 6                  // mHiddenSize, mSeqLen, 3*VocabSize, mPadId
-           + sizeof(mUseFullMask)                     // mask type
-           + 2 * sizeof(float) * mHiddenSize           // beta + gamma
-           + wordSize * mHiddenSize * mWordVocabSize  // word emb
-           + wordSize * mHiddenSize * mPosVocabSize   // pos emb
-           + wordSize * mHiddenSize * mTokVocabSize   // tok emb
-        ;
-}
-
-void EmbLayerNormInt8PluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mEmbType);
-    serialize_value(&buffer, mMhaType);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mSeqLen);
-    serialize_value(&buffer, mPadId);
-    serialize_value(&buffer, mWordVocabSize);
-    serialize_value(&buffer, mPosVocabSize);
-    serialize_value(&buffer, mTokVocabSize);
-    serialize_value(&buffer, mUseFullMask);
-
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, mBetaDev.get(), mHiddenSize);
-    serFromDev(d, mGammaDev.get(), mHiddenSize);
-    size_t const wordSize = getElementSize(mEmbType);
-    serFromDev(d, static_cast<char*>(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize);
-    serFromDev(d, static_cast<char*>(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize);
-    serFromDev(d, static_cast<char*>(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize);
-}
-
-void EmbLayerNormInt8PluginDynamic::destroy() noexcept {
-    gLogInfo << "EmbLayerNormInt8PluginDynamic destroy." << endl;
-    // This gets called when the network containing plugin is destroyed
-    mGammaDev.reset(nullptr);
-    mBetaDev.reset(nullptr);
-    mWordEmbDev.reset(nullptr);
-    mPosEmbDev.reset(nullptr);
-    mTokEmbDev.reset(nullptr);
-    delete this;
-}
-
-void EmbLayerNormInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* EmbLayerNormInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType EmbLayerNormInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                          int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index >= 0 && index <= 2);
-    if (index == 0) {
-        return mMhaType;
-    }
-    if (index == 1) {
-        return DataType::kINT8;
-    }
-    return DataType::kFLOAT;
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "EmbLayerNormInt8PluginDynamic clone." << endl;
-
-        auto p = new EmbLayerNormInt8PluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb,
-                                                   mTokEmb, mUseFullMask);
-        p->mSeqLen = mSeqLen;
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs EmbLayerNormInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                             int32_t nbInputs, IExprBuilder& exprBuilder) noexcept {
-    try {
-        // Input should be input ids and token ids and the input mask
-        // Output should be the embeddings tensor and mask indices
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2);  // BxS
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims);
-
-        IXRT_PLUGIN_ASSERT(outputIndex >= 0 || outputIndex <= 2);
-
-        if (outputIndex == 0) {
-            DimsExprs ret;
-            ret.nbDims = 5;
-            ret.d[0] = inputs[0].d[BDIM];
-            ret.d[1] = inputs[0].d[SDIM];
-            ret.d[2] = exprBuilder.constant(mHiddenSize);
-            ret.d[3] = exprBuilder.constant(1);
-            ret.d[4] = exprBuilder.constant(1);
-            return ret;
-        }
-        if (outputIndex == 1) {
-            DimsExprs ret;
-            ret.nbDims = 2;
-            ret.d[0] = inputs[0].d[BDIM];
-            ret.d[1] = inputs[0].d[SDIM];
-            return ret;
-        }
-
-        DimsExprs ret;
-        ret.nbDims = 5;
-        ret.d[0] = inputs[0].d[BDIM];
-        ret.d[1] = inputs[0].d[SDIM];
-        ret.d[2] = exprBuilder.constant(mHiddenSize);
-        ret.d[3] = exprBuilder.constant(1);
-        ret.d[4] = exprBuilder.constant(1);
-        return ret;
-
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool EmbLayerNormInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut,
-                                                              int32_t nbInputs, int32_t nbOutputs) noexcept {
-    // 3 inputs of size BxS
-    IXRT_PLUGIN_ASSERT(nbInputs == 3);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 3);
-
-    PluginTensorDesc const& desc = inOut[pos];
-    if (desc.format != TensorFormat::kLINEAR) {
-        return false;
-    }
-    if (pos == 0) {
-        return desc.type == DataType::kINT32;
-    }
-
-    PluginTensorDesc const& prev = inOut[pos - 1];
-    if (pos == 1 || pos == 2) {
-        return desc.type == DataType::kINT32 && desc.format == prev.format;
-    }
-
-    // emb_out
-    if (pos == 3 || pos == 4) {
-        return desc.type == DataType::kINT8 && desc.format == prev.format;
-    }
-    // residual
-    return desc.type == DataType::kFLOAT;
-}
-
-void EmbLayerNormInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                                    DynamicPluginTensorDesc const* outputs,
-                                                    int32_t nbOutputs) noexcept {
-    gLogInfo << "EmbLayerNormInt8PluginDynamic configurePlugin." << endl;
-
-    // Validate input arguments
-    IXRT_PLUGIN_ASSERT(nbOutputs == 3);
-    IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-    IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2);
-    int32_t const S = inputs[0].desc.dims.d[SDIM];
-    mSeqLen = S;
-    int32_t const B = inputs[0].desc.dims.d[BDIM];
-    TRT_UNUSED B;
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[1].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[2].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]);
-
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[SDIM]) == mSeqLen);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B);
-    IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[2]) == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1);
-
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == S);
-
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[SDIM] == outputs[0].desc.dims.d[SDIM]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[BDIM] == outputs[0].desc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[2] == outputs[0].desc.dims.d[2]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[4] == 1);
-}
-
-size_t EmbLayerNormInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                       PluginTensorDesc const* outputs,
-                                                       int32_t nbOutputs) const noexcept {
-    int32_t const B = inputs[0].dims.d[BDIM];
-    int32_t const S = inputs[0].dims.d[SDIM];
-    return B * S * sizeof(int32_t);
-}
-
-int32_t EmbLayerNormInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                               void const* const* inputs, void* const* outputs, void* workspace,
-                                               cudaStream_t stream) noexcept {
-    try {
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t status = STATUS_SUCCESS;
-        int32_t fmha_S = S;
-        int32_t batch_tokens = B * fmha_S;
-
-        // Our plugin outputs only one tensor
-        auto const inputIds = static_cast<int32_t const*>(inputs[0]);
-        auto const segmentIds = static_cast<int32_t const*>(inputs[1]);
-
-        float const* beta = mBetaDev.get();
-        float const* gamma = mGammaDev.get();
-        auto output = static_cast<int8_t*>(outputs[0]);
-        auto mNewMask = static_cast<int8_t*>(outputs[1]);
-        auto residual = static_cast<float*>(outputs[2]);
-        auto const wordEmb = static_cast<float const*>(mWordEmbDev.get());
-        auto const tokEmb = static_cast<float const*>(mTokEmbDev.get());
-        auto const posEmb = static_cast<float const*>(mPosEmbDev.get());
-
-        float l0_qkv_in_amax = outputDesc[0].scale * 127;
-
-        auto mask_idx = static_cast<int32_t*>(workspace);
-        status = embLayerNorm(stream, static_cast<int32_t>(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma,
-                              wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, residual, output, mask_idx,
-                              mPadId, l0_qkv_in_amax);
-
-        IxinferMaskPad(mask_idx, mNewMask, B, S, mHiddenSize, fmha_S, batch_tokens, stream);
-
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu
deleted file mode 100644
index 3aa0cd86..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu
+++ /dev/null
@@ -1,342 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "embLayerNormInt8Plugin.h"
-#include "backend/bert/bert_helper.h"
-
-namespace nvinfer1::ixrt_plugin {
-using namespace backend;
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferResidualI8O(const float *input, int8_t *output, int hidden_size, float quant_scale) {
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size;
-
-    input += block_start;
-    output += block_start;
-
-    float4 *p_input = (float4 *)input;
-    char4 *p_output = (char4 *)output;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        vals[it].x = p_input[element_index].x;
-        vals[it].y = p_input[element_index].y;
-        vals[it].z = p_input[element_index].z;
-        vals[it].w = p_input[element_index].w;
-
-        char4 res = float42char4(vals[it], quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <typename T>
-void IxinferResidualI8OLauncher(const T *input, int8_t *output, int batch_tokens, int hidden_size, float quant_scale,
-                                  cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size / 4 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int num_warp = hidden_size / C10_WARP_SIZE / 4;
-
-    switch (num_warp) {
-        case 1:
-            IxinferResidualI8O<1><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 2:
-            IxinferResidualI8O<2><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 3:
-            IxinferResidualI8O<3><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 4:
-            IxinferResidualI8O<4><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 5:
-            IxinferResidualI8O<5><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 6:
-            IxinferResidualI8O<6><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 7:
-            IxinferResidualI8O<7><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 8:
-            IxinferResidualI8O<8><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 9:
-            IxinferResidualI8O<9><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 10:
-            IxinferResidualI8O<10><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 11:
-            IxinferResidualI8O<11><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 12:
-            IxinferResidualI8O<12><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 13:
-            IxinferResidualI8O<13><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 14:
-            IxinferResidualI8O<14><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 15:
-            IxinferResidualI8O<15><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        case 16:
-            IxinferResidualI8O<16><<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, quant_scale);
-            break;
-        default:
-            throw std::runtime_error("IxinferResidualI8OLauncher");
-            break;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferBertEmbedLnKernel(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens,
-                                         float *output, int *pad_mask, int *type_ids, int pad_id, int batch_size,
-                                         int seq_len, int hidden_dim, const float *scale, const float *bias) {
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_dim;
-    int batch_idx, seq_idx;
-    batch_idx = blockIdx.x / seq_len;
-    seq_idx = blockIdx.x % seq_len;
-
-    int tokens_idx = blockIdx.x;
-    int token = tokens[tokens_idx];
-    int token_type = type_ids[tokens_idx];
-
-    output += block_start;
-
-    float4 *p_output = (float4 *)output;
-
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_value = (float4 *)(token_emb + token * hidden_dim);
-    float4 *p_pemb = (float4 *)(pos_emb + seq_idx * hidden_dim);
-    float4 *p_temb = (float4 *)(type_emb + token_type * hidden_dim);
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-
-        if (token == pad_id) {
-            if (element_index == 0) {
-                pad_mask[tokens_idx] = 1;
-            }
-            vals[it] = make_float4(0.f, 0.f, 0.f, 0.f);
-
-        } else {
-            if (element_index == 0) {
-                pad_mask[tokens_idx] = 0;
-            }
-        
-            vals[it].x = p_value[element_index].x + p_pemb[element_index].x + p_temb[element_index].x;
-            vals[it].y = p_value[element_index].y + p_pemb[element_index].y + p_temb[element_index].y;
-            vals[it].z = p_value[element_index].z + p_pemb[element_index].z + p_temb[element_index].z;
-            vals[it].w = p_value[element_index].w + p_pemb[element_index].w + p_temb[element_index].w;
-            WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-        }
-    }
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 scale_value = p_scale[element_index];
-        float4 bias_value = p_bias[element_index];
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_dim, epsilon,
-                                                      scale_value, bias_value);
-        int tokens_idx = blockIdx.x;
-
-        int token = tokens[tokens_idx];
-        if (token == pad_id) {
-            p_output[element_index] = make_float4(0.f, 0.f, 0.f, 0.f);
-        } else {
-            p_output[element_index] = norm_value;
-        }
-    }
-}
-
-
-void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output,
-                        int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size,
-                        const float *scale, const float *bias, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    if (hidden_size % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    int batch_tokens = batch_size * seq_len;
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-    int num_warp = hidden_size / C10_WARP_SIZE / 4; 
-
-    switch (num_warp) {
-        case 1:
-            IxinferBertEmbedLnKernel<1>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 2:
-            IxinferBertEmbedLnKernel<2>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 3:
-            IxinferBertEmbedLnKernel<3>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 4:
-            IxinferBertEmbedLnKernel<4>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 5:
-            IxinferBertEmbedLnKernel<5>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 6:
-            IxinferBertEmbedLnKernel<6>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 7:
-            IxinferBertEmbedLnKernel<7>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 8:
-            IxinferBertEmbedLnKernel<8>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 9:
-            IxinferBertEmbedLnKernel<9>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 10:
-            IxinferBertEmbedLnKernel<10>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 11:
-            IxinferBertEmbedLnKernel<11>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 12:
-            IxinferBertEmbedLnKernel<12>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 13:
-            IxinferBertEmbedLnKernel<13>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 14:
-            IxinferBertEmbedLnKernel<14>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 15:
-            IxinferBertEmbedLnKernel<15>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 16:
-            IxinferBertEmbedLnKernel<16>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        default:
-            throw std::runtime_error("IxinferBertEmbedLn");
-            break;
-    }
-}
-
-cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds,
-    float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize,
-    int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float l0_qkv_in_amax)
-{
-    IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, buffer, maskIdx, (int*)segmentIds,
-                                    padId, B, S, E, gamma, beta, stream);
-                         
-    IxinferResidualI8OLauncher<float>(buffer, output, B*S, E, 127.0 / l0_qkv_in_amax, stream);
-    return cudaSuccess;
-}
-
-void __global__ IxinferMaskPadKernel(const int32_t* mask, int8_t* new_mask, int bsz,
-                                    int ori_seq_len, int hsz, int fmha_seq_len) {
-    int batch_idx = blockIdx.x;
-    int seq_idx = blockIdx.y;
-
-    if (seq_idx < ori_seq_len) {
-        if (threadIdx.x == 0) {
-            new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx];
-        }
-    } else {
-        new_mask[batch_idx * fmha_seq_len + seq_idx] = 1;
-    }
-} 
-
-void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                   int fmha_seq_len, int batch_tokens, cudaStream_t stream) {
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hsz/2>4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 !=0");
-    }
-    if (ori_seq_len > fmha_seq_len) {
-        throw std::runtime_error("ori_seq_len > fmha_seq_len");
-    }
-    if (bsz * ori_seq_len > batch_tokens) {
-        throw std::runtime_error("bsz*ori_seq_len > batch_tokens");
-    }
-    dim3 blockSize(bsz, fmha_seq_len);
-    IxinferMaskPadKernel<<<blockSize, hsz / 2, 0, stream>>>(mask, new_mask, bsz, ori_seq_len, hsz,
-                                                           fmha_seq_len);
-}
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin                  
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h
deleted file mode 100644
index 5fee7a43..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-#include <cuda_fp16.h>
-#include <cuda_runtime_api.h>
-
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output,
-                        int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size,
-                        const float *scale, const float *bias, cudaStream_t stream);
-
-cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds,
-    float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize,
-    int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float token_embed_amax_);
-
-void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                   int fmha_seq_len, int batch_tokens, cudaStream_t stream);
-
-class EmbLayerNormInt8PluginDynamic : public IPluginV2DynamicExt {
-   public:
-    EmbLayerNormInt8PluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType,
-        nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb,
-        nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0);
-    EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length);
-    EmbLayerNormInt8PluginDynamic() noexcept = delete;
-    ~EmbLayerNormInt8PluginDynamic() override = default;
-
-    // IPluginV2 methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* libNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext methods
-    DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt methods
-    IPluginV2DynamicExt* clone() const noexcept override;
-    DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                  IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
-                         int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
-                            int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
-                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-   private:
-    const std::string mLayerName;
-    std::string mNamespace;
-    size_t mHiddenSize;
-    size_t mSeqLen;
-    size_t mPadId;
-    DataType mEmbType;
-    bool mUseFullMask;
-    DataType mMhaType;
-    size_t mWordVocabSize, mPosVocabSize, mTokVocabSize;
-    cuda_unique_ptr<float> mGammaDev;
-    cuda_unique_ptr<float> mBetaDev;
-    cuda_unique_ptr<void> mWordEmbDev;
-    cuda_unique_ptr<void> mTokEmbDev;
-    cuda_unique_ptr<void> mPosEmbDev;
-    // cuda_unique_ptr<int32_t> mNewMask;
-    WeightsWithOwnership mBeta;
-    WeightsWithOwnership mGamma;
-    WeightsWithOwnership mWordEmb;
-    WeightsWithOwnership mTokEmb;
-    WeightsWithOwnership mPosEmb; 
-};
-
-class EmbLayerNormInt8PluginDynamicCreator : public IPluginCreator {
-   public:
-    EmbLayerNormInt8PluginDynamicCreator();
-
-    ~EmbLayerNormInt8PluginDynamicCreator() override = default;
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    PluginFieldCollection const* getFieldNames() noexcept override;
-
-    IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override;
-
-    IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static PluginFieldCollection mFC;
-    static std::vector<PluginField> mPluginAttributes;
-    std::string mNamespace;
-
-};
-
-
-} // namespace bert
-} //namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp
deleted file mode 100644
index 499b2eef..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp
+++ /dev/null
@@ -1,495 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "embLayerNormPlugin.h"
-
-#include "NvInferImpl.h"
-#include "checkMacrosPlugin.h"
-#include "common_def.cuh"
-#include "driver_types.h"
-
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* EMB_LAYER_NORM_VERSION{"1"};
-char const* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection EmbLayerNormPluginDynamicCreator::mFC{};
-std::vector<PluginField> EmbLayerNormPluginDynamicCreator::mPluginAttributes;
-
-EmbLayerNormPluginDynamicCreator::EmbLayerNormPluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings"));
-    mPluginAttributes.emplace_back(PluginField("output_fp16"));
-    mPluginAttributes.emplace_back(PluginField("full_mask"));
-    mPluginAttributes.emplace_back(PluginField("mha_type_id"));
-    mPluginAttributes.emplace_back(PluginField("pad_id"));
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* EmbLayerNormPluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_NAME; }
-
-char const* EmbLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; }
-
-PluginFieldCollection const* EmbLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::createPlugin(char const* name,
-                                                                    PluginFieldCollection const* fc) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-        gLogInfo << "EmbLayerNormPluginDynamic createPlugin." << endl;
-        std::set<std::string> const requiredAttributes{
-            "bert_embeddings_layernorm_beta",      "bert_embeddings_layernorm_gamma",
-            "bert_embeddings_word_embeddings",     "bert_embeddings_token_type_embeddings",
-            "bert_embeddings_position_embeddings",
-        };
-
-        bool output_fp16 = false;
-        bool useFullMask = false;
-        Weights beta{};
-        Weights gamma{};
-        Weights word_emb{};
-        Weights pos_emb{};
-        Weights tok_emb{};
-        int32_t mhaTypeId = 0;
-        int32_t pad_id = 0;
-
-        for (auto i = 0; i < fc->nbFields; i++) {
-            std::string field_name(fc->fields[i].name);
-            if (field_name.compare("bert_embeddings_layernorm_beta") == 0) {
-                gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl;
-                beta.values = fc->fields[i].data;
-                beta.count = fc->fields[i].length;
-                beta.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) {
-                gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl;
-                gamma.values = fc->fields[i].data;
-                gamma.count = fc->fields[i].length;
-                gamma.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_word_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_word_embeddings..." << endl;
-                word_emb.values = fc->fields[i].data;
-                word_emb.count = fc->fields[i].length;
-                word_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl;
-                tok_emb.values = fc->fields[i].data;
-                tok_emb.count = fc->fields[i].length;
-                tok_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_position_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_position_embeddings..." << endl;
-                pos_emb.values = fc->fields[i].data;
-                pos_emb.count = fc->fields[i].length;
-                pos_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("output_fp16") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building output_fp16: " << output_fp16 << endl;
-            }
-
-            if (field_name.compare("full_mask") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                useFullMask = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building full_mask: " << useFullMask << endl;
-            }
-
-            if (field_name.compare("mha_type_id") == 0) {
-                mhaTypeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3);
-                gLogInfo << "Building mha typeId: " << mhaTypeId << endl;
-            }
-
-            if (field_name.compare("pad_id") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32)
-                pad_id = *static_cast<int32_t const*>(fc->fields[i].data);
-            }
-        }
-        gLogInfo << "Building EmbLayerNormPluginDynamic Plugin..." << endl;
-        DataType mhaType = static_cast<DataType>(mhaTypeId);
-        EmbLayerNormPluginDynamic* p =
-            new EmbLayerNormPluginDynamic(name, output_fp16 ? DataType::kHALF : DataType::kFLOAT, mhaType, beta, gamma,
-                                          word_emb, pos_emb, tok_emb, useFullMask, pad_id);
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                         size_t serialLength) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(serialData != nullptr);
-        return new EmbLayerNormPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void EmbLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr);
-        mNamespace = pluginNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* EmbLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator);
-
-//#########################################################################//
-EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, DataType const type,
-                                                     DataType const mhaType, Weights const& beta, Weights const& gamma,
-                                                     Weights const& wordEmb, Weights const& posEmb,
-                                                     Weights const& tokEmb, bool const useFullMask, int32_t padId)
-    : mLayerName(name),
-      mHiddenSize(beta.count),
-      mEmbType(type),
-      mUseFullMask(useFullMask),
-      mMhaType(mhaType),
-      mPadId(padId) {
-    IXRT_PLUGIN_ASSERT(beta.count == gamma.count);
-    IXRT_PLUGIN_ASSERT(mHiddenSize > 0U);
-    IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0);
-    IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0);
-    IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0);
-    mWordVocabSize = wordEmb.count / mHiddenSize;
-    mPosVocabSize = posEmb.count / mHiddenSize;
-    mTokVocabSize = tokEmb.count / mHiddenSize;
-
-    mBeta.convertAndCopy(beta, nvinfer1::DataType::kHALF);
-    mGamma.convertAndCopy(gamma, nvinfer1::DataType::kHALF);
-    mWordEmb.convertAndCopy(wordEmb, mEmbType);
-    mTokEmb.convertAndCopy(tokEmb, mEmbType);
-    mPosEmb.convertAndCopy(posEmb, mEmbType);
-
-    copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev);
-    copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev);
-    copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev);
-    copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev);
-    copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev);
-}
-
-EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length)
-    : mLayerName(name),
-      mGammaDev(nullptr),
-      mBetaDev(nullptr),
-      mWordEmbDev(nullptr),
-      mTokEmbDev(nullptr),
-      mPosEmbDev(nullptr) {
-    gLogInfo << "EmbLayerNormPluginDynamic deserialize." << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mEmbType);
-    deserialize_value(&data, &length, &mMhaType);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mSeqLen);
-    deserialize_value(&data, &length, &mPadId);
-    deserialize_value(&data, &length, &mWordVocabSize);
-    deserialize_value(&data, &length, &mPosVocabSize);
-    deserialize_value(&data, &length, &mTokVocabSize);
-    deserialize_value(&data, &length, &mUseFullMask);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF);
-    mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF);
-    mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType);
-    mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType);
-    mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType);
-
-    copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev);
-    copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev);
-    copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev);
-    copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev);
-    copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev);
-}
-
-// IPluginV2 Methods
-char const* EmbLayerNormPluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_NAME; }
-
-char const* EmbLayerNormPluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; }
-
-int32_t EmbLayerNormPluginDynamic::getNbOutputs() const noexcept { return 2; }
-
-int32_t EmbLayerNormPluginDynamic::initialize() noexcept { return 0; }
-
-void EmbLayerNormPluginDynamic::terminate() noexcept {  gLogInfo << "EmbLayerNormPluginDynamic terminate." << endl; }
-
-size_t EmbLayerNormPluginDynamic::getSerializationSize() const noexcept {
-    size_t const wordSize = getElementSize(mEmbType);
-    return sizeof(mEmbType) * 2                       // mEmbType, mMhaType
-           + sizeof(mHiddenSize) * 6                  // mHiddenSize, mSeqLen, 3*VocabSize, mPadId
-           + sizeof(mUseFullMask)                     // mask type
-           + 2 * sizeof(half) * mHiddenSize           // beta + gamma
-           + wordSize * mHiddenSize * mWordVocabSize  // word emb
-           + wordSize * mHiddenSize * mPosVocabSize   // pos emb
-           + wordSize * mHiddenSize * mTokVocabSize   // tok emb
-        ;
-}
-
-void EmbLayerNormPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mEmbType);
-    serialize_value(&buffer, mMhaType);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mSeqLen);
-    serialize_value(&buffer, mPadId);
-    serialize_value(&buffer, mWordVocabSize);
-    serialize_value(&buffer, mPosVocabSize);
-    serialize_value(&buffer, mTokVocabSize);
-    serialize_value(&buffer, mUseFullMask);
-
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, mBetaDev.get(), mHiddenSize);
-    serFromDev(d, mGammaDev.get(), mHiddenSize);
-    size_t const wordSize = getElementSize(mEmbType);
-    serFromDev(d, static_cast<char*>(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize);
-    serFromDev(d, static_cast<char*>(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize);
-    serFromDev(d, static_cast<char*>(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize);
-}
-
-void EmbLayerNormPluginDynamic::destroy() noexcept {
-    gLogInfo << "EmbLayerNormPluginDynamic destroy." << endl;
-    // This gets called when the network containing plugin is destroyed
-    mGammaDev.reset(nullptr);
-    mBetaDev.reset(nullptr);
-    mWordEmbDev.reset(nullptr);
-    mPosEmbDev.reset(nullptr);
-    mTokEmbDev.reset(nullptr);
-    delete this;
-}
-
-void EmbLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* EmbLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType EmbLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                      int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0 || index == 1);
-    if (index == 0) {
-        IXRT_PLUGIN_ASSERT(mMhaType == DataType::kHALF || mMhaType == DataType::kFLOAT);
-        return mMhaType;
-    }
-    return DataType::kINT32;
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* EmbLayerNormPluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "EmbLayerNormPluginDynamic clone." << endl;
-
-        auto p = new EmbLayerNormPluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb,
-                                               mTokEmb, mUseFullMask);
-        p->mSeqLen = mSeqLen;
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs EmbLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                                         IExprBuilder& exprBuilder) noexcept {
-    try {
-        // Input should be input ids and token ids and the input mask
-        // Output should be the embeddings tensor and mask indices
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2);  // BxS
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims);
-
-        IXRT_PLUGIN_ASSERT(outputIndex == 0 || outputIndex == 1);
-
-        if (outputIndex == 0) {
-            DimsExprs ret;
-            ret.nbDims = 5;
-            ret.d[0] = inputs[0].d[0];
-            ret.d[1] = inputs[0].d[1];
-            ret.d[2] = exprBuilder.constant(mHiddenSize);
-            ret.d[3] = exprBuilder.constant(1);
-            ret.d[4] = exprBuilder.constant(1);
-            return ret;
-        }
-
-        DimsExprs ret;
-        ret.nbDims = 2;
-        ret.d[0] = inputs[0].d[BDIM];
-        ret.d[1] = inputs[0].d[SDIM];
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool EmbLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                          int32_t nbOutputs) noexcept {
-    // 3 inputs of size BxS
-    IXRT_PLUGIN_ASSERT(nbInputs == 3);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 2);
-
-    PluginTensorDesc const& desc = inOut[pos];
-    if (desc.format != TensorFormat::kLINEAR) {
-        return false;
-    }
-    if (pos == 0) {
-        return desc.type == DataType::kINT32;
-    }
-
-    PluginTensorDesc const& prev = inOut[pos - 1];
-    if (pos == 1 || pos == 2) {
-        return desc.type == DataType::kINT32 && desc.format == prev.format;
-    }
-
-    // embedded sequence
-    if (pos == 3) {
-        return desc.type == mMhaType && desc.format == prev.format;
-    }
-    // mask
-    return desc.type == ((mMhaType == DataType::kHALF) ? DataType::kINT32 : mMhaType);
-}
-
-void EmbLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                                DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    gLogInfo << "EmbLayerNormPluginDynamic configurePlugin." << endl;
-
-    // Validate input arguments
-    IXRT_PLUGIN_ASSERT(nbOutputs == 2);
-    IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-    IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2);
-    int32_t const S = inputs[0].desc.dims.d[SDIM];
-    mSeqLen = S;
-    int32_t const B = inputs[0].desc.dims.d[BDIM];
-    TRT_UNUSED B;
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[1].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[2].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]);
-
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(mSeqLen == outputs[0].desc.dims.d[SDIM])
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B);
-    IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[2]) == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1);
-
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == mSeqLen);
-}
-
-size_t EmbLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                   PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-int32_t EmbLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                           void const* const* inputs, void* const* outputs, void* workspace,
-                                           cudaStream_t stream) noexcept {
-    gLogInfo << "enqueue EmbLayerNormPluginDynamic.." << endl;
-    try {
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t status = STATUS_SUCCESS;
-        int32_t fmha_S = S;
-        int32_t batch_tokens = B * fmha_S;
-
-        // Our plugin outputs only one tensor
-        auto const inputIds = static_cast<int32_t const*>(inputs[0]);
-        auto const segmentIds = static_cast<int32_t const*>(inputs[1]);
-
-        half const* beta = mBetaDev.get();
-        half const* gamma = mGammaDev.get();
-        if (mMhaType == DataType::kFLOAT) {
-            gLogError << "embLayerNormPlugin float type not supported!" << endl;
-            return STATUS_NOT_SUPPORTED;
-        } else if (mMhaType == DataType::kHALF) {
-            auto output = static_cast<half*>(outputs[0]);
-            auto mNewMask = static_cast<int32_t*>(outputs[1]);
-            auto const wordEmb = static_cast<half const*>(mWordEmbDev.get());
-            auto const tokEmb = static_cast<half const*>(mTokEmbDev.get());
-            auto const posEmb = static_cast<half const*>(mPosEmbDev.get());
-
-            status =
-                embLayerNorm(stream, static_cast<int32_t>(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma,
-                                wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, output, mNewMask, mPadId);
-            if (status != cudaSuccess) {
-                return STATUS_FAILURE;
-            }
-        }
-        else {
-            gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received "
-                      << static_cast<int32_t>(mMhaType) << endl;
-
-            return STATUS_NOT_SUPPORTED;
-        }
-
-        return status;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu
deleted file mode 100644
index 5766d382..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu
+++ /dev/null
@@ -1,258 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "embLayerNormPlugin.h"
-#include "backend/bert/bert_helper.h"
-
-namespace nvinfer1::ixrt_plugin {
-using namespace backend;
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferBertEmbedLnKernel(const __half *token_emb, const __half *pos_emb, const __half *type_emb,
-                                         const int *tokens, __half *output, int *pad_mask, int *type_ids, int pad_id,
-                                         int batch_size, int seq_len, int hidden_dim, const __half *scale,
-                                         const __half *bias) {
-    float2 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_dim;
-    output += block_start;
-
-    __half2 *p_output = (__half2 *)output;
-    __half2 *p_scale = (__half2 *)scale;
-    __half2 *p_bias = (__half2 *)bias;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-
-        int batch_idx, seq_idx, dim_idx;
-        batch_idx = blockIdx.x / seq_len;
-        seq_idx = blockIdx.x % seq_len;
-        dim_idx = element_index;
-        int tokens_idx = blockIdx.x;
-        int token = tokens[tokens_idx];
-        int token_type = type_ids[tokens_idx];
-
-        half2 value;
-
-        if (token == pad_id) {
-            if (dim_idx == 0) {
-                pad_mask[tokens_idx] = 1;
-            }
-            value.x = __float2half(0.f);
-            value.y = __float2half(0.f);
-
-        } else {
-            if (dim_idx == 0) {
-                pad_mask[tokens_idx] = 0;
-            }
-            value = ((half2 *)(token_emb + token * hidden_dim + dim_idx * 2))[0];
-            half2 pemb = ((half2 *)(pos_emb + seq_idx * hidden_dim + dim_idx * 2))[0];
-            half2 temb = ((half2 *)(type_emb + token_type * hidden_dim + dim_idx * 2))[0];
-
-            vals[it].x = __half2float(value.x) + __half2float(pemb.x) + __half2float(temb.x);
-            vals[it].y = __half2float(value.y) + __half2float(pemb.y) + __half2float(temb.y);
-
-            WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        }
-
-        float mean = 0;
-        float m2 = 0;
-        float count = 0;
-        WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-        mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-        m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-        count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-        m2 = rsqrtf(m2 / hidden_dim + epsilon);
-
-#pragma unroll
-        for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-            int element_index = threadIdx.x + it * C10_WARP_SIZE;
-
-            __half2 scale_value = p_scale[element_index];
-            __half2 bias_value = p_bias[element_index];
-
-            float2 norm_value;
-            norm_value.x = (vals[it].x - mean) * m2 * __half2float(scale_value.x) + __half2float(bias_value.x);
-            norm_value.y = (vals[it].y - mean) * m2 * __half2float(scale_value.y) + __half2float(bias_value.y);
-
-            __half2 res;
-            res.x = __float2half(norm_value.x);
-            res.y = __float2half(norm_value.y);
-
-            int token = tokens[tokens_idx];
-            if (token == pad_id) {
-                res.x = __float2half(0.f);
-                res.y = __float2half(0.f);
-                p_output[element_index] = res;
-            } else {
-                p_output[element_index] = res;
-            }
-        }
-    }
-}
-
-void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb,
-                                const int *tokens, half *output, int *pad_mask, int *type_ids, int pad_id,
-                                int batch_size, int seq_len, int hidden_size, const half *scale, const half *bias,
-                                cudaStream_t stream) {
-    if (hidden_size > 2048) {
-        throw std::runtime_error("hidden_size should <= 2048");
-    }
-    if (hidden_size / 2 % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size / 2 // C10_WARP_SIZE != 0");
-    }
-    int batch_tokens = batch_size * seq_len;
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int num_warp = hidden_size / C10_WARP_SIZE / 2;
-
-    switch (num_warp) {
-        case 1:
-            IxinferBertEmbedLnKernel<1>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 2:
-            IxinferBertEmbedLnKernel<2>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 3:
-            IxinferBertEmbedLnKernel<3>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 4:
-            IxinferBertEmbedLnKernel<4>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 5:
-            IxinferBertEmbedLnKernel<5>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 6:
-            IxinferBertEmbedLnKernel<6>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 7:
-            IxinferBertEmbedLnKernel<7>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 8:
-            IxinferBertEmbedLnKernel<8>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 9:
-            IxinferBertEmbedLnKernel<9>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 10:
-            IxinferBertEmbedLnKernel<10>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 11:
-            IxinferBertEmbedLnKernel<11>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 12:
-            IxinferBertEmbedLnKernel<12>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 13:
-            IxinferBertEmbedLnKernel<13>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 14:
-            IxinferBertEmbedLnKernel<14>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 15:
-            IxinferBertEmbedLnKernel<15>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        case 16:
-            IxinferBertEmbedLnKernel<16>
-                <<<gridSize, blockSize, 0, stream>>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids,
-                                                     pad_id, batch_size, seq_len, hidden_size, scale, bias);
-            break;
-        default:
-            throw std::runtime_error("IxinferBertEmbedLn");
-            break;
-    }
-}
-
-cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds,
-    half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize,
-    int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId)
-{
-    IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, output, maskIdx, (int*)segmentIds,
-                                    padId, B, S, E, gamma, beta, stream);
-    return cudaSuccess;
-}
-
-void __global__ IxinferMaskPadKernel(const int32_t* mask, int32_t* new_mask, int bsz,
-                                    int ori_seq_len, int hsz, int fmha_seq_len) {
-    int batch_idx = blockIdx.x;
-    int seq_idx = blockIdx.y;
-
-    if (seq_idx < ori_seq_len) {
-        if (threadIdx.x == 0) {
-            new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx];
-        }
-    } else {
-        new_mask[batch_idx * fmha_seq_len + seq_idx] = 1;
-    }
-} 
-
-void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                   int fmha_seq_len, int batch_tokens, cudaStream_t stream) {
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hsz/2>4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 !=0");
-    }
-    if (ori_seq_len > fmha_seq_len) {
-        throw std::runtime_error("ori_seq_len > fmha_seq_len");
-    }
-    if (bsz * ori_seq_len > batch_tokens) {
-        throw std::runtime_error("bsz*ori_seq_len > batch_tokens");
-    }
-    dim3 blockSize(bsz, fmha_seq_len);
-    IxinferMaskPadKernel<<<blockSize, hsz / 2, 0, stream>>>(mask, new_mask, bsz, ori_seq_len, hsz,
-                                                           fmha_seq_len);
-}
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h
deleted file mode 100644
index f96e7d73..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-#include <cuda_fp16.h>
-#include <cuda_runtime_api.h>
-
-
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-cudaError embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds,
-    half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize,
-    int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId);
-
-void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz,
-                   int fmha_seq_len, int batch_tokens, cudaStream_t stream);
-
-void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, const int *tokens, half *output,
-                        int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size,
-                        const half *scale, const half *bias, cudaStream_t stream);;
-
-class EmbLayerNormPluginDynamic : public IPluginV2DynamicExt {
-   public:
-    EmbLayerNormPluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType,
-        nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb,
-        nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0);
-    EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length);
-    EmbLayerNormPluginDynamic() noexcept = delete;
-    ~EmbLayerNormPluginDynamic() override = default;
-
-    // IPluginV2 methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* libNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext methods
-    DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt methods
-    IPluginV2DynamicExt* clone() const noexcept override;
-    DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                  IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
-                         int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
-                            int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
-                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-   private:
-    const std::string mLayerName;
-    std::string mNamespace;
-    size_t mHiddenSize;
-    size_t mSeqLen;
-    size_t mPadId;
-    DataType mEmbType;
-    bool mUseFullMask;
-    DataType mMhaType;
-    size_t mWordVocabSize, mPosVocabSize, mTokVocabSize;
-    cuda_unique_ptr<half> mGammaDev;
-    cuda_unique_ptr<half> mBetaDev;
-    cuda_unique_ptr<void> mWordEmbDev;
-    cuda_unique_ptr<void> mTokEmbDev;
-    cuda_unique_ptr<void> mPosEmbDev;
-    WeightsWithOwnership mBeta;
-    WeightsWithOwnership mGamma;
-    WeightsWithOwnership mWordEmb;
-    WeightsWithOwnership mTokEmb;
-    WeightsWithOwnership mPosEmb; 
-};
-
-class EmbLayerNormPluginDynamicCreator : public IPluginCreator {
-   public:
-    EmbLayerNormPluginDynamicCreator();
-
-    ~EmbLayerNormPluginDynamicCreator() override = default;
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    PluginFieldCollection const* getFieldNames() noexcept override;
-
-    IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override;
-
-    IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static PluginFieldCollection mFC;
-    static std::vector<PluginField> mPluginAttributes;
-    std::string mNamespace;
-
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp
deleted file mode 100644
index 30b47f88..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "ffnPlugin.h"
-
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#endif
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "gelu/geluPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace {
-char const* const kFFN_VERSION{"1"};
-char const* const kFFN_NAME{"CustomFFNPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection FFNPluginDynamicCreator::mFFN{};
-std::vector<PluginField> FFNPluginDynamicCreator::mPluginAttributes;
-
-FFNPluginDynamicCreator::FFNPluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("act_type", nullptr, PluginFieldType::kINT32, 1));
-
-    mFFN.nbFields = mPluginAttributes.size();
-    mFFN.fields = mPluginAttributes.data();
-}
-
-char const* FFNPluginDynamicCreator::getPluginName() const noexcept { return kFFN_NAME; }
-
-char const* FFNPluginDynamicCreator::getPluginVersion() const noexcept { return kFFN_VERSION; }
-
-PluginFieldCollection const* FFNPluginDynamicCreator::getFieldNames() noexcept { return &mFFN; }
-
-IPluginV2* FFNPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating FFNPluginDynamicCreator..." << endl;
-        IXRT_PLUGIN_ASSERT(name != nullptr);
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        int32_t outDims = 0;
-        int32_t typeId = -1;
-        int32_t act_type = -1;
-        Weights W1{DataType::kFLOAT, nullptr, 0LL};
-        Weights W2{DataType::kFLOAT, nullptr, 0LL};
-        Weights B1{DataType::kFLOAT, nullptr, 0LL};
-        ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W1", "W2", "B1"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string fieldName(fc->fields[i].name);
-            if (fieldName.compare("out_dims") == 0) {
-                outDims = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building outDims: " << outDims << endl;
-            }
-
-            if (fieldName.compare("type_id") == 0) {
-                typeId = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-
-            if (fieldName.compare("W1") == 0) {
-                gLogInfo << "Building W1..." << endl;
-                W1.values = fc->fields[i].data;
-                W1.count = fc->fields[i].length;
-                W1.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W1 float32: " << (W1.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("W2") == 0) {
-                gLogInfo << "Building W2..." << endl;
-                W2.values = fc->fields[i].data;
-                W2.count = fc->fields[i].length;
-                W2.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is W2 float32: " << (W2.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("B1") == 0) {
-                gLogInfo << "Building B1..." << endl;
-                B1.values = fc->fields[i].data;
-                B1.count = fc->fields[i].length;
-                B1.type = fieldTypeToDataType(fc->fields[i].type);
-                gLogInfo << "Is B1 float32: " << (B1.type == DataType::kFLOAT) << endl;
-            }
-
-            if (fieldName.compare("act_type") == 0) {
-                gLogInfo << "Building act_type..." << endl;
-                act_type = static_cast<int32_t const*>(fc->fields[i].data)[0];
-                gLogInfo << "Building act_type: " << act_type << endl;
-            }
-        }
-
-        if (outDims <= 0) {
-            gLogInfo << "Invalid output dimension" << endl;
-        }
-        if (typeId < 0 || typeId > 1) {
-            gLogInfo << "Invalid type id" << typeId << endl;
-        }
-        if (W1.count == 0 || W1.values == nullptr) {
-            gLogInfo << "Invalid weights W1" << endl;
-        }
-        if (W2.count == 0 || W2.values == nullptr) {
-            gLogInfo << "Invalid weights W2" << endl;
-        }
-        if (B1.count == 0 || B1.values == nullptr) {
-            gLogInfo << "Invalid weights B1" << endl;
-        }
-
-        DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF;
-        return new FFNPluginDynamic(name, type, outDims, act_type, W1, W2, B1);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* FFNPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                      size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call FFNPluginDynamic::destroy()
-    try {
-        return new FFNPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void FFNPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FFNPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(FFNPluginDynamicCreator);
-//#########################################################################//
-FFNPluginDynamic::FFNPluginDynamic(std::string const name, DataType const type, int32_t const outDim,
-                                   int32_t const act_type, Weights const& W1, Weights const& W2, Weights const& B1)
-    : mLayerName(name),
-      mType(type),
-      mHiddenSize(outDim),
-      mActType(act_type),
-      mWdev1(nullptr),
-      mWdev2(nullptr),
-      mBdev1(nullptr) {
-    mW1.convertAndCopy(W1, mType);
-    mW2.convertAndCopy(W2, mType);
-    mB1.convertAndCopy(B1, mType);
-    copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1);
-    copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2);
-    copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1);
-}
-
-FFNPluginDynamic::FFNPluginDynamic(std::string const name, void const* data, size_t length)
-    : mLayerName(name), mWdev1(nullptr), mWdev2(nullptr), mBdev1(nullptr) {
-    gLogInfo << "FFNPluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mActType);
-
-    char const* d = static_cast<char const*>(data);
-
-    mW1.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType);
-    copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1);
-
-    mW2.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType);
-    copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2);
-
-    mB1.convertAndCopy(d, mHiddenSize * 4, mType);
-    copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1);
-}
-
-// IPluginV2 Methods
-char const* FFNPluginDynamic::getPluginType() const noexcept { return kFFN_NAME; }
-
-char const* FFNPluginDynamic::getPluginVersion() const noexcept { return kFFN_VERSION; }
-
-int32_t FFNPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t FFNPluginDynamic::initialize() noexcept {
-    gLogInfo << "FFNPluginDynamic initialize" << endl;
-    return 0;
-}
-
-void FFNPluginDynamic::terminate() noexcept { gLogInfo << "FFNPluginDynamic terminate" << endl; }
-
-size_t FFNPluginDynamic::getSerializationSize() const noexcept {
-    size_t wordSize = getElementSize(mType);
-    return wordSize * (mHiddenSize * mHiddenSize * 8 + mHiddenSize * 4) + sizeof(mType) + sizeof(mHiddenSize) +
-           sizeof(mActType);
-}
-
-void FFNPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mActType);
-
-    size_t wordSize = getElementSize(mType);
-    char* d = static_cast<char*>(buffer);
-    serFromDev(d, static_cast<char*>(mWdev1.get()), 4 * mHiddenSize * mHiddenSize * wordSize);
-    serFromDev(d, static_cast<char*>(mWdev2.get()), 4 * mHiddenSize * mHiddenSize * wordSize);
-    serFromDev(d, static_cast<char*>(mBdev1.get()), 4 * mHiddenSize * wordSize);
-}
-
-void FFNPluginDynamic::destroy() noexcept {
-    gLogInfo << "FFNPluginDynamic destroy" << endl;
-    mWdev1.reset(nullptr);
-    mWdev2.reset(nullptr);
-    mBdev1.reset(nullptr);
-    delete this;
-}
-
-void FFNPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* FFNPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType FFNPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                             int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-    IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* FFNPluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "FFNPluginDynamic clone" << endl;
-
-        auto* p = new FFNPluginDynamic(mLayerName, mType, mHiddenSize, mActType, mW1, mW2, mB1);
-        p->setPluginNamespace(mNamespace.c_str());
-
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs FFNPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                                IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        DimsExprs ret;
-        ret.nbDims = 5;
-        ret.d[0] = inputs[0].d[0];
-        ret.d[1] = inputs[0].d[1];
-        ret.d[2] = exprBuilder.constant(mHiddenSize);
-        ret.d[3] = exprBuilder.constant(1);
-        ret.d[4] = exprBuilder.constant(1);
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool FFNPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                 int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    IXRT_PLUGIN_ASSERT(inOut != nullptr);
-
-    PluginTensorDesc const& in = inOut[pos];
-    if (pos == 0) {
-        return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-    }
-    PluginTensorDesc const& prev = inOut[pos - 1];
-
-    // output
-    return in.type == prev.type && in.format == prev.format;
-}
-
-void FFNPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                       DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-        auto const& inDims0 = inputs[0].desc.dims;
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-        CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t FFNPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                          PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    int32_t const S = inputs[0].dims.d[SDIM];
-    int32_t const B = inputs[0].dims.d[BDIM];
-    return B * S * 4 * mHiddenSize * sizeof(half);
-}
-
-int32_t FFNPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                  void const* const* inputs, void* const* outputs, void* workSpace,
-                                  cudaStream_t stream) noexcept {
-    gLogInfo << "in FFNPluginDynamic.." << endl;
-    try {
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const n = S * B;
-        IXRT_PLUGIN_ASSERT(n >= 0);
-
-        if (mType == DataType::kHALF) {
-            auto const* const input = static_cast<half const*>(inputs[0]);
-            auto* output = static_cast<half*>(outputs[0]);
-            auto weight1 = static_cast<half*>(mWdev1.get());
-            auto weight2 = static_cast<half*>(mWdev2.get());
-            auto bias1 = static_cast<half*>(mBdev1.get());
-            auto buffer = static_cast<half*>(workSpace);
-
-#ifdef __ILUVATAR__
-            cuinfer_gemm(weight1, input, bias1, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, mActType,
-                         stream, cuinfer_handle);
-            cuinfer_gemm(weight2, buffer, nullptr, output, 1, mHiddenSize, n, 4 * mHiddenSize, 0, 0, 0, 1.0f, -1,
-                         stream, cuinfer_handle);
-#else
-            cublaslt_gemm(weight1, input, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, blaslt_handle,
-                          stream);
-            computeGeluBias(buffer, buffer, bias1, 4 * mHiddenSize, n, stream);
-            cublaslt_gemm(weight2, buffer, output, 1, mHiddenSize, n, mHiddenSize * 4, 0, 0, 0, 1.0f, blaslt_handle,
-                          stream);
-#endif
-        } else {
-            gLogError << "Unsupported type error, expected [kHALF], but received " << static_cast<int32_t>(mType)
-                      << endl;
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h
deleted file mode 100644
index 21459c9b..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-#ifdef __ILUVATAR__
-#include <ixinfer.h>
-#endif
-
-#include <memory>
-
-#include "NvInferRuntime.h"
-#include "NvInferRuntimeCommon.h"
-#include "backend/cublas/cublas_helper.h"
-#include "bertCommon.h"
-#include <vector>
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-class FFNPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    FFNPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim,
-                     int32_t const out_type, nvinfer1::Weights const& W1, nvinfer1::Weights const& W2,
-                     nvinfer1::Weights const& B1);
-
-    FFNPluginDynamic(std::string const name, void const* data, size_t length);
-
-    // It doesn't make sense to make FFNPluginDynamic without arguments, so we
-    // delete default constructor.
-    FFNPluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    size_t mHiddenSize;
-    size_t mActType;
-
-    bert::WeightsWithOwnership mW1;
-    bert::WeightsWithOwnership mB1;
-    bert::WeightsWithOwnership mW2;
-    bert::cuda_unique_ptr<void> mWdev1;
-    bert::cuda_unique_ptr<void> mWdev2;
-    bert::cuda_unique_ptr<void> mBdev1;
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
-
-class FFNPluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    FFNPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFFN;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-class FFNInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    FFNInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim,
-                         nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector<float> const& scale);
-
-    FFNInt8PluginDynamic(std::string const name, void const* data, size_t length);
-
-    // It doesn't make sense to make FFNInt8PluginDynamic without arguments, so we
-    // delete default constructor.
-    FFNInt8PluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    std::string const mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    size_t mOutDim;  // leading dim
-    size_t mNumParams;
-    int32_t mNmax;
-    int32_t mK;
-    int32_t mNumBias;
-
-    vector<float> mScale;
-
-    bert::WeightsWithOwnership mW;
-    bert::cuda_unique_ptr<void> mWdev;
-
-    bert::WeightsWithOwnership mBias;
-    bert::cuda_unique_ptr<void> mBiasdev;
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-};
-
-class FFNInt8PluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    FFNInt8PluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-}  // namespace bert
-}  // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp
deleted file mode 100644
index b9ae5177..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "geluPlugin.h"
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-
-#include <cstdint>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* const kGELU_IXRT_PLUGIN_VERSION{"1"};
-char const* const kGELU_IXRT_PLUGIN_NAME{"CustomGeluPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection GeluPluginDynamicCreator::mFC{};
-std::vector<PluginField> GeluPluginDynamicCreator::mPluginAttributes;
-
-GeluPluginDynamicCreator::GeluPluginDynamicCreator() {
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1));
-
-    // Fill PluginFieldCollection with PluginField arguments metadata
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* GeluPluginDynamicCreator::getPluginName() const noexcept { return kGELU_IXRT_PLUGIN_NAME; }
-
-char const* GeluPluginDynamicCreator::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; }
-
-PluginFieldCollection const* GeluPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* GeluPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogVerbose << "GeluPluginDynamicCreator createPlugin\n";
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        Weights bias{DataType::kFLOAT, nullptr, 0};
-        int32_t typeId = -1;
-        ixrt_plugin::validateRequiredAttributesExist({"type_id", "ld"}, fc);
-        int32_t ld = 0;
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr);
-            std::string fieldName(fc->fields[i].name);
-
-            if (fieldName.compare("type_id") == 0) {
-                typeId = *static_cast<int32_t const*>(fc->fields[i].data);
-            }
-            if (fieldName.compare("bias") == 0) {
-                bias.values = fc->fields[i].data;
-                bias.count = fc->fields[i].length;
-                bias.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-            if (fieldName.compare("ld") == 0) {
-                ld = *static_cast<int32_t const*>(fc->fields[i].data);
-            }
-        }
-
-        if (typeId < 0 || typeId > 3) {
-            gLogError << "GeluPluginDynamicCreator: invalid typeId " << typeId << std::endl;
-            return nullptr;
-        }
-
-        return new GeluPluginDynamic(name, static_cast<DataType>(typeId), bias, ld);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* GeluPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                       size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call GeluPluginDynamic::destroy()
-    try {
-        return new GeluPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void GeluPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* GeluPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(GeluPluginDynamicCreator);
-//#########################################################################//
-GeluPluginDynamic::GeluPluginDynamic(const std::string name, const DataType type, Weights const& bias, const int ld)
-    : mLayerName(name), mType(type), mLd(ld), mNumBias(bias.count) {
-    if (mNumBias > 0) {
-        mBias.convertAndCopy(bias, DataType::kHALF);
-        copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev);
-    }
-}
-
-GeluPluginDynamic::GeluPluginDynamic(const std::string name, void const* data, size_t length) : mLayerName(name) {
-    gLogVerbose << "GeluPluginDynamic deserialize\n";
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mLd);
-    deserialize_value(&data, &length, &mNumBias);
-
-    if (mNumBias > 0) {
-        IXRT_PLUGIN_ASSERT(mLd > 0);
-        char const* d = static_cast<char const*>(data);
-        mBias.convertAndCopy(d, mNumBias, DataType::kHALF);
-        copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev);
-    }
-}
-
-// IPluginV2 Methods
-
-char const* GeluPluginDynamic::getPluginType() const noexcept { return kGELU_IXRT_PLUGIN_NAME; }
-
-char const* GeluPluginDynamic::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; }
-
-int32_t GeluPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t GeluPluginDynamic::initialize() noexcept {
-    gLogVerbose << "GeluPluginDynamic initalize\n";
-    return 0;
-}
-
-void GeluPluginDynamic::terminate() noexcept { gLogVerbose << "GeluPluginDynamic terminate\n"; }
-
-size_t GeluPluginDynamic::getSerializationSize() const noexcept {
-    const size_t wordSize = getElementSize(mType);
-    return sizeof(mType) + sizeof(mLd) + sizeof(mNumBias) + mNumBias * sizeof(half);
-}
-
-void GeluPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mLd);
-    serialize_value(&buffer, mNumBias);
-    if (mNumBias > 0) {
-        IXRT_PLUGIN_ASSERT(mLd > 0);
-        char* d = static_cast<char*>(buffer);
-
-        serFromDev(d, static_cast<char*>(mBiasDev.get()), mLd * getElementSize(DataType::kHALF));
-    }
-}
-
-void GeluPluginDynamic::destroy() noexcept {
-    gLogVerbose << "GeluPluginDynamic destroy\n";
-    // This gets called when the network containing plugin is destroyed
-    if (mNumBias > 0) {
-        mBiasDev.reset();
-    }
-    delete this;
-}
-
-void GeluPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(libNamespace != nullptr);
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* GeluPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-nvinfer1::DataType GeluPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                                        int32_t nbInputs) const noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(index == 0);
-        IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-        IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF ||
-                           inputTypes[0] == DataType::kINT8);
-        return inputTypes[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DataType{};
-}
-
-// IPluginV2DynamicExt Methods
-nvinfer1::IPluginV2DynamicExt* GeluPluginDynamic::clone() const noexcept {
-    try {
-        gLogVerbose << "GeluPluginDynamic clone\n";
-        auto* plugin = new GeluPluginDynamic(mLayerName, mType, mBias, mLd);
-        plugin->setPluginNamespace(mNamespace.c_str());
-        return plugin;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs,
-                                                           int32_t nbInputs,
-                                                           nvinfer1::IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        return inputs[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool GeluPluginDynamic::supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut,
-                                                  int32_t nbInputs, int32_t nbOutputs) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inOut != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(pos >= 0);
-        IXRT_PLUGIN_ASSERT(pos < nbInputs + nbOutputs);
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return false;
-    }
-
-    PluginTensorDesc const& input = inOut[0];
-    if (pos == 0) {
-        return (input.type == mType) && (input.format == TensorFormat::kLINEAR);
-    }
-    if (pos == 1) {
-        PluginTensorDesc const& output = inOut[1];
-        return (input.type == output.type) && (output.format == TensorFormat::kLINEAR) && (output.type == mType);
-    }
-    return false;
-}
-
-void GeluPluginDynamic::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                                        nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept {
-    gLogVerbose << "GeluPluginDynamic configurePlugin\n";
-
-    try {
-        IXRT_PLUGIN_ASSERT(in != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 1);
-        IXRT_PLUGIN_ASSERT(mType == in[0].desc.type);
-        IXRT_PLUGIN_ASSERT(mType == DataType::kHALF || mType == DataType::kINT8);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t GeluPluginDynamic::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                                           nvinfer1::PluginTensorDesc const* outputs,
-                                           int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-template <typename TDataType>
-int32_t GeluPluginDynamic::enqueueTyped(void const* input_, void* output_, int32_t const inputVolume,
-                                        cudaStream_t stream) noexcept {
-    TDataType const* input = static_cast<TDataType const*>(input_);
-    TDataType* output = static_cast<TDataType*>(output_);
-    int32_t const cols = inputVolume / mLd;
-    int32_t const rows = mLd;
-
-    if (mNumBias > 0) {
-        TDataType const* bias = static_cast<TDataType*>(mBiasDev.get());
-        return computeGeluBias(output, input, bias, rows, cols, stream);
-    } else {
-        return computeGelu(stream, inputVolume, input, output);
-    }
-}
-
-int32_t GeluPluginDynamic::enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale,
-                                       int32_t const inputVolume, cudaStream_t stream) noexcept {
-    int8_t const* input = static_cast<int8_t const*>(input_);
-    int8_t* output = static_cast<int8_t*>(output_);
-    int32_t const cols = inputVolume / mLd;
-    int32_t const rows = mLd;
-
-    if (mNumBias > 0) {
-        half const* bias = static_cast<half*>(mBiasDev.get());
-        return computeGeluI8O8Bias(output, input, bias, rows, cols, dequant_scale, quant_scale, stream);
-    } else {
-        return computeGeluI8O8(stream, inputVolume, input, output, dequant_scale, quant_scale);
-    }
-}
-
-int32_t GeluPluginDynamic::enqueue(nvinfer1::PluginTensorDesc const* inputDesc,
-                                   nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs,
-                                   void* const* outputs, void* workspace, cudaStream_t stream) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputDesc != nullptr);
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return STATUS_FAILURE;
-    }
-
-    int32_t const inputVolume = volume(inputDesc[0].dims);
-    int32_t batch_token_num = inputDesc[0].dims.d[BDIM] * inputDesc[0].dims.d[SDIM];
-
-    // Our plugin outputs only one tensor.
-    // Launch CUDA kernel wrapper and save its return value.
-    switch (mType) {
-        case DataType::kFLOAT:
-            return enqueueTyped<float>(inputs[0], outputs[0], inputVolume, stream);
-        case DataType::kHALF:
-            return enqueueTyped<half>(inputs[0], outputs[0], inputVolume, stream);
-        case DataType::kINT8: {
-            int8_t* input = (int8_t*)(inputs[0]);
-            int8_t* output = (int8_t*)(outputs[0]);
-            IxinferBiasGeluI8II8O(batch_token_num, stream, (int8_t*)input, (int8_t*)output,
-                                           static_cast<half*>(mBiasDev.get()), mLd,  inputDesc[0].scale,
-                                           1.0/outputDesc[0].scale);
-            return STATUS_SUCCESS;
-        }
-        default:
-            return STATUS_FAILURE;
-    }
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu
deleted file mode 100644
index c36cac15..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "backend/bert/bert_helper.h"
-#include "geluPlugin.h"
-
-namespace nvinfer1::ixrt_plugin {
-using namespace backend;
-namespace bert {
-// constants for approximating the normal cdf
-constexpr float A = 0.5f;
-constexpr float B = 0.7978845608028654f;    // sqrt(2.0/M_PI)
-constexpr float C = 0.035677408136300125f;  // 0.044715 * sqrt(2.0/M_PI)
-
-
-template <typename T>
-__global__ void IxinferBiasGeluI8II8OKernel(int8_t *input, int8_t *output, const T *bias, int feature_dim,
-                                            float dequant_scale, float quant_scale) {
-    int block_start = blockIdx.x * feature_dim;
-    int start = block_start + threadIdx.x;
-    int end = block_start + feature_dim;
-    for (int i = start; i < end; i += blockDim.x) {
-        int input_index = i;
-
-        float fout = gelu<float>(float(input[input_index]) * dequant_scale + __ldg(&bias[i - block_start]));
-
-        int output_index = i;
-        output[output_index] = float2int8(fout, quant_scale);
-    }
-}
-
-template <>
-__global__ void IxinferBiasGeluI8II8OKernel<__half>(int8_t *input, int8_t *output, const __half *bias, int feature_dim,
-                                                    float dequant_scale, float quant_scale) {
-    //  #pragma unroll
-    for (int block_index = 0; block_index < 2; block_index++) {
-        int block_start = (blockIdx.x * 2 + block_index) * feature_dim;
-        int start = block_start + threadIdx.x * 4;
-        int input_index = start;
-        char4 *p_input = (char4 *)(input + input_index);
-        half2 *p_bias = (half2 *)(bias + input_index - block_start);
-        float fout1 = gelu<float>(float(p_input[0].x) * dequant_scale + __half2float(p_bias[0].x));
-        float fout2 = gelu<float>(float(p_input[0].y) * dequant_scale + __half2float(p_bias[0].y));
-        float fout3 = gelu<float>(float(p_input[0].z) * dequant_scale + __half2float(p_bias[1].x));
-        float fout4 = gelu<float>(float(p_input[0].w) * dequant_scale + __half2float(p_bias[1].y));
-
-        int output_index = start;
-        char4 out;
-        out.x = float2int8(fout1, quant_scale);
-        out.y = float2int8(fout2, quant_scale);
-        out.z = float2int8(fout3, quant_scale);
-        out.w = float2int8(fout4, quant_scale);
-        char4 *p_output = (char4 *)(output + output_index);
-
-        p_output[0] = out;
-    }
-}
-
-template <typename T>
-void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias,
-                           int feature_dim, float dequant_scale, float quant_scale) {
-    IxinferBiasGeluI8II8OKernel<T>
-        <<<batch_token_num, 1024, 0, stream>>>(input, output, bias, feature_dim, dequant_scale, quant_scale);
-}
-
-template void IxinferBiasGeluI8II8O<half>(int, cudaStream_t, int8_t*, int8_t *, const half *, int, float, float);
-
-template <unsigned TPB>
-__global__ void geluKernel(const half a, const half b, const half c, int n, const half* input, half* output) {
-    const int idx = blockIdx.x * TPB + threadIdx.x;
-
-    if (idx < n) {
-        const half in = input[idx];
-        const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b))));
-        output[idx] = in * cdf;
-    }
-}
-
-template <unsigned TPB>
-__global__ void geluKernel(const float a, const float b, const float c, int n, const float* input, float* output) {
-    const int idx = blockIdx.x * TPB + threadIdx.x;
-
-    if (idx < n) {
-        const float in = input[idx];
-        const float cdf = a + a * tanh(in * (c * in * in + b));
-        output[idx] = in * cdf;
-    }
-}
-
-template <unsigned TPB>
-__global__ void geluKernel(const float a, const float b, const float c, int n, const int8_t* input, int8_t* output,
-                           float dequant_scale, float quant_scale) {
-    const int idx = blockIdx.x * TPB + threadIdx.x;
-
-    if (idx < n) {
-        const float in = float(input[idx]) * dequant_scale;
-        const float cdf = a + a * tanh(in * (c * in * in + b));
-        float i8_f = in * cdf * quant_scale;
-        int32_t i8 = floorf(i8_f + 0.5);
-        i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
-        output[idx] = int8_t(i8);
-    }
-}
-
-int computeGelu(cudaStream_t stream, int n, const float* input, float* output) {
-    constexpr int blockSize = 256;
-    const int gridSize = (n + blockSize - 1) / blockSize;
-    geluKernel<blockSize><<<gridSize, blockSize, 0, stream>>>(A, B, C, n, input, output);
-
-    return 0;
-}
-
-int computeGelu(cudaStream_t stream, int n, const half* input, half* output) {
-    constexpr int blockSize = 256;
-    const int gridSize = (n + blockSize - 1) / blockSize;
-    geluKernel<blockSize><<<gridSize, blockSize, 0, stream>>>(A, B, C, n, input, output);
-
-    return 0;
-}
-
-int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale,
-                        float quant_scale) {
-    constexpr int blockSize = 256;
-    const int gridSize = (n + blockSize - 1) / blockSize;
-    geluKernel<blockSize><<<gridSize, blockSize, 0, stream>>>(A, B, C, n, input, output, dequant_scale, quant_scale);
-
-    return 0;
-}
-
-template <int TPB>
-__global__ void geluBiasKernel(const half a, const half b, const half c, half* output, const half* input,
-                               const half* bias, const int ld) {
-    const int offset = blockIdx.x * ld;
-
-    for (int it = threadIdx.x; it < ld; it += TPB) {
-        const int idx = it + offset;
-        const half in = input[idx] + bias[it];
-        const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b))));
-        output[idx] = in * cdf;
-    }
-}
-
-template <int TPB>
-__global__ void geluBiasKernel(const float a, const float b, const float c, float* output, const float* input,
-                               const float* bias, const int ld) {
-    const int offset = blockIdx.x * ld;
-
-    for (int it = threadIdx.x; it < ld; it += TPB) {
-        const int idx = it + offset;
-        const float in = input[idx] + bias[it];
-        const float cdf = a + a * tanh(in * (c * in * in + b));
-        output[idx] = in * cdf;
-    }
-}
-
-template <int TPB>
-__global__ void geluBiasKernel(const float a, const float b, const float c, int8_t* output, const int8_t* input,
-                               const half* bias, float dequant_scale, float quant_scale, const int ld) {
-    const int offset = blockIdx.x * ld;
-
-    for (int it = threadIdx.x; it < ld; it += TPB) {
-        const int idx = it + offset;
-        const float in = float(input[idx]) * dequant_scale + __half2float(bias[it]);
-        const float cdf = a + a * tanh(in * (c * in * in + b));
-        float i8_f = in * cdf * quant_scale;
-        int32_t i8 = floorf(i8_f + 0.5);
-        i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
-        output[idx] = int8_t(i8);
-    }
-}
-
-int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols,
-                    cudaStream_t stream) {
-    geluBiasKernel<256><<<cols, 256, 0, stream>>>(A, B, C, output, input, bias, ld);
-    return cudaPeekAtLastError();
-}
-
-int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols,
-                    cudaStream_t stream) {
-    geluBiasKernel<256><<<cols, 256, 0, stream>>>(A, B, C, output, input, bias, ld);
-    return cudaPeekAtLastError();
-}
-
-int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols,
-                            float dequant_scale, float quant_scale, cudaStream_t stream) {
-    geluBiasKernel<256><<<cols, 256, 0, stream>>>(A, B, C, output, input, bias, dequant_scale, quant_scale, ld);
-    return cudaPeekAtLastError();
-}
-
-}  // namespace bert
-}  // namespace nvinfer1::plugin
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h
deleted file mode 100644
index 182fe7f3..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#ifdef __ILUVATAR__
-#include <ixinfer.h>
-#endif
-
-#include <vector>
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <typename T>
-void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias,
-                           int feature_dim, float dequant_scale, float quant_scale);
-
-int32_t computeGelu(cudaStream_t stream, int32_t n, float const* input, float* output);
-
-int32_t computeGelu(cudaStream_t stream, int32_t n, half const* input, half* output);
-
-int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale,
-                        float quant_scale);
-
-int32_t computeGeluBias(float* output, float const* input, float const* bias, int32_t const ld, int32_t const cols,
-                        cudaStream_t stream);
-
-int32_t computeGeluBias(half* output, half const* input, half const* bias, int32_t const ld, int32_t const cols,
-                        cudaStream_t stream);
-
-int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols,
-                            float dequant_scale, float quant_scale, cudaStream_t stream);
-
-class GeluPluginDynamic : public nvinfer1::IPluginV2DynamicExt {
-   public:
-    GeluPluginDynamic(const std::string name, const nvinfer1::DataType type, nvinfer1::Weights const& bias,
-                      const int ld);
-
-    GeluPluginDynamic(const std::string name, void const* data, size_t length);
-
-    // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete
-    // default constructor.
-    GeluPluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                         int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-                                            nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                         nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-                            nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-                    void const* const* inputs, void* const* outputs, void* workspace,
-                    cudaStream_t stream) noexcept override;
-
-   private:
-    // Helper method for enqueue()
-    template <typename TDataType>
-    int32_t enqueueTyped(void const* input, void* output, int32_t const inputVolume, cudaStream_t stream) noexcept;
-    int32_t enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale,
-                        int32_t const inputVolume, cudaStream_t stream) noexcept;
-
-    const std::string mLayerName;
-    std::string mNamespace;
-
-    nvinfer1::DataType mType;
-    bert::WeightsWithOwnership mBias;
-    bert::cuda_unique_ptr<void> mBiasDev;
-    size_t mLd;
-    size_t mNumBias;
-};
-
-class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator {
-   public:
-    GeluPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData,
-                                           size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-   private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-}  // namespace bert
-}  // namespace nvinfer1::plugin
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp
deleted file mode 100644
index c3a25ba1..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "qkvToContextInt8Plugin.h"
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "driver_types.h"
-#include "plugin.h"
-#include "serialize.h"
-#include <iomanip>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION{"3"};
-char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"};
-}  // namespace
-
-PluginFieldCollection QKVToContextInt8PluginDynamicCreator::mFC{};
-std::vector<PluginField> QKVToContextInt8PluginDynamicCreator::mPluginAttributes;
-
-constexpr uint32_t IIDX = 0;  // index of the input tensor
-constexpr uint32_t MIDX = 1;  // index of the mask
-/*
-dq_probs:
-_arrange_qkv_amax
-_softmax_in_amax
-_softmax_out_amax
-*/
-QKVToContextInt8PluginDynamicCreator::QKVToContextInt8PluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 3));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* QKVToContextInt8PluginDynamicCreator::getPluginName() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME;
-}
-
-char const* QKVToContextInt8PluginDynamicCreator::getPluginVersion() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION;
-}
-
-PluginFieldCollection const* QKVToContextInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* QKVToContextInt8PluginDynamicCreator::createPlugin(char const* name,
-                                                              PluginFieldCollection const* fc) noexcept {
-    try {
-        int32_t hiddenSize = 0;
-        // Since numHeads must always exist or validateRequiredAttributes will fail,
-        // we can set numHeads to -1 so that static analysis tools don't warn about
-        // a division by zero in QKVToContextInt8PluginDynamic constructor.
-        int32_t numHeads{-1};
-
-        vector<float> dqProbs;
-
-        ixrt_plugin::validateRequiredAttributesExist({"hidden_size", "num_heads"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string field_name(fc->fields[i].name);
-
-            if (field_name.compare("hidden_size") == 0) {
-                hiddenSize = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0,
-                                        ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str());
-                gLogInfo << "Building hiddenSize: " << hiddenSize << endl;
-            }
-            if (field_name.compare("num_heads") == 0) {
-                numHeads = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str());
-                gLogInfo << "Building numHeads: " << numHeads << endl;
-            }
-            if (field_name.compare("dq_probs") == 0) {
-                IXRT_PLUGIN_CHECK_VALUE(fc->fields[i].length > 0,
-                                        ("QKV: dpProbs can not be empty, error: [dpProbs.length == 0]!"));
-                gLogInfo << "Building dqProbs: [";
-                for (auto j = 0; j < fc->fields[i].length; j++) {
-                    dqProbs.emplace_back(static_cast<float const*>((fc->fields[i].data))[j]);
-                    gLogInfo << std::setprecision(5) << dqProbs[j];
-                }
-                gLogInfo << "]" << endl;
-            }
-        }
-
-        QKVToContextInt8PluginDynamic* p = new QKVToContextInt8PluginDynamic(name, hiddenSize, numHeads, dqProbs);
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* QKVToContextInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                   size_t serialLength) noexcept {
-    try {
-        // This object will be deleted when the network is destroyed, which will
-        // call QKVToContextInt8PluginDynamic::destroy() noexcept
-        return new QKVToContextInt8PluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void QKVToContextInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* QKVToContextInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(QKVToContextInt8PluginDynamicCreator);
-//#########################################################################//
-QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize,
-                                                             int32_t const numHeads, vector<float> const dqProbs)
-    : mLayerName(name),
-      mS(0),
-      mB(0),
-      mHeadSize(hiddenSize / numHeads),
-      mHiddenSize(hiddenSize),
-      mNumHeads(numHeads),
-      mDqProbs(dqProbs) {}
-
-QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length)
-    : mLayerName(name) {
-    gLogInfo << "deserialize QKVToContextInt8PluginDynamic" << endl;
-    deserialize_value(&data, &length, &mNumHeads);
-    deserialize_value(&data, &length, &mHeadSize);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mDqProbs);
-}
-
-// IPluginV2 Methods
-char const* QKVToContextInt8PluginDynamic::getPluginType() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME;
-}
-
-char const* QKVToContextInt8PluginDynamic::getPluginVersion() const noexcept {
-    return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION;
-}
-
-int32_t QKVToContextInt8PluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t QKVToContextInt8PluginDynamic::initialize() noexcept { return 0; }
-
-void QKVToContextInt8PluginDynamic::terminate() noexcept {}
-
-size_t QKVToContextInt8PluginDynamic::getSerializationSize() const noexcept {
-    return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(mHiddenSize) + mDqProbs.size() * sizeof(float) +
-           sizeof(mDqProbs.size());
-}
-
-void QKVToContextInt8PluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mNumHeads);
-    serialize_value(&buffer, mHeadSize);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mDqProbs);
-}
-
-void QKVToContextInt8PluginDynamic::destroy() noexcept { delete this; }
-
-void QKVToContextInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; }
-
-char const* QKVToContextInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType QKVToContextInt8PluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                                          int32_t nbInputs) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0)
-    return DataType::kINT8;
-}
-
-// IPluginV2DynamicExt Methods
-nvinfer1::IPluginV2DynamicExt* QKVToContextInt8PluginDynamic::clone() const noexcept {
-    try {
-        QKVToContextInt8PluginDynamic* ret =
-            new QKVToContextInt8PluginDynamic(mLayerName, mHiddenSize, mNumHeads, mDqProbs);
-
-        ret->setPluginNamespace(mNamespace.c_str());
-        return ret;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs QKVToContextInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                             int32_t nbInputs, IExprBuilder& exprBuilder) noexcept {
-    // input [B, S, 3*E] int8
-    // pad_mask [B, S] int8
-    
-    // output [B, S, E] int8
-    IXRT_PLUGIN_ASSERT(outputIndex == 0);
-    // Copy over everything
-    DimsExprs output(inputs[IIDX]);
-    // Divide last dim by three
-    auto const* three = exprBuilder.constant(3);
-    output.d[HDIM] = exprBuilder.constant(mHiddenSize);
-    return output;
-}
-bool QKVToContextInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut,
-                                                              int32_t nbInputs, int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 2);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    return (inOut[pos].type == DataType::kINT8) && (inOut[pos].format == TensorFormat::kLINEAR);
-}
-
-void QKVToContextInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                                                    DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 2);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    PluginTensorDesc const& inDesc = in[IIDX].desc;
-    PluginTensorDesc const& outDesc = out[0].desc;
-    IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5)
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1);
-
-    PluginTensorDesc const& maskDesc = in[MIDX].desc;
-    IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2);
-    IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]);
-    IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]);
-
-    const int32_t S = inDesc.dims.d[SDIM];
-
-    IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == S);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1);
-
-#ifdef __ILUVATAR__
-    CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-    CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-}
-
-size_t QKVToContextInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                       PluginTensorDesc const* outputs,
-                                                       int32_t nbOutputs) const noexcept {
-    const int32_t B = inputs[0].dims.d[BDIM];
-    const int32_t S = inputs->dims.d[SDIM];
-    const int32_t E = inputs->dims.d[HDIM];
-    IXRT_PLUGIN_ASSERT(E == 3 * mHiddenSize);
-    int64_t buffer_size = B * S * E * sizeof(int8_t) + B * S * S * mNumHeads * sizeof(int8_t);
-#ifndef __ILUVATAR__
-    buffer_size += B * S * S * mNumHeads * sizeof(int32_t);
-#endif
-    return buffer_size;
-}
-
-int32_t QKVToContextInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                               void const* const* inputs, void* const* outputs, void* workspace,
-                                               cudaStream_t stream) noexcept {
-    try {
-#ifdef __ILUVATAR__
-        CUINFER_CHECK(cuinferSetStream(cuinfer_handle, 0));
-#endif
-        int32_t const B = inputDesc[0].dims.d[BDIM];
-        int32_t const S = inputDesc[0].dims.d[SDIM];
-
-        float qkv_out_amax_ = inputDesc[0].scale * 127;
-        float linear_in_amax_ = outputDesc[0].scale * 127;
-        float arrange_qkv_amax_ = mDqProbs[0];
-        float softmax_in_amax_ = mDqProbs[1];
-        float softmax_out_amax_ = mDqProbs[2];
-
-        int8_t* qkv_buffer_ = (int8_t*)inputs[0];
-        int8_t* qkv_out_ = (int8_t*)outputs[0];
-        int8_t* mask_ = (int8_t*)inputs[1];
-
-        int64_t buffer_size = B * S * mHiddenSize;
-        int64_t buffer_size2 = B * S * S * mNumHeads;
-        int8_t* q_buffer_ = static_cast<int8_t*>(workspace);
-        int8_t* k_buffer_ = q_buffer_ + buffer_size;
-        int8_t* v_buffer_ = k_buffer_ + buffer_size;
-        int8_t* qk_buffer_ = v_buffer_ + buffer_size;
-        
-#ifdef __ILUVATAR__
-        auto status =
-            fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_,
-                                          qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_,
-                                          softmax_in_amax_, softmax_out_amax_, linear_in_amax_, cuinfer_handle, stream);
-#else
-        int32_t* qk_out_ = reinterpret_cast<int32_t*>(qk_buffer_ + buffer_size2);
-        auto status =
-            fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_,
-                                          qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_,
-                                          softmax_in_amax_, softmax_out_amax_, linear_in_amax_, blaslt_handle, stream);
-#endif
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return STATUS_FAILURE;
-    }
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu
deleted file mode 100644
index 2330debf..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu
+++ /dev/null
@@ -1,488 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "backend/bert/bert_helper.h"
-#include "backend/cublas/cublas_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#endif
-#include "qkvToContextInt8Plugin.h"
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-const int _max_thread_per_block = 1024;
-const float _quant_range = 127.0;
-
-__global__ void IxinferArrangeEncselfQkvI8II8ONoBias(const int8_t *ori_qkv, int8_t *new_qkv, int max_batch_dim,
-                                                     int batch_seq_len, int dim_per_head, int head_num) {
-    int hidden_size = dim_per_head * head_num;
-    int batch_id = blockIdx.x / batch_seq_len;
-    int token_id = blockIdx.x % batch_seq_len;
-
-    int i = threadIdx.x;  // 1个线程处理4个数据
-
-    int head_id = (i * 4) / dim_per_head;
-    int dim_id = (i * 4) % dim_per_head;
-    int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num, batch_seq_len, dim_per_head);
-
-#pragma unroll
-    for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) {
-        char4 *p_ori_qkv = (char4 *)(ori_qkv + (blockIdx.x * 3 + qkv_idx) * hidden_size);
-        int qkv_offset = max_batch_dim * qkv_idx;
-        char4 *p_new_qkv = (char4 *)(new_qkv + qkv_offset + target_id);
-        p_new_qkv[0] = p_ori_qkv[i];
-    }
-}
-
-template <int log2_elements, int WARP_BATCH>
-__global__ void IxinferCorrelationSoftmaxEncselfI8II8OKernel(int8_t *correlation, const int8_t *src_padding_mask,
-                                                             int batch_seq_len, float quant_scale,
-                                                             float dequant_scale) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-    int local_idx = threadIdx.x;
-
-    for (int warp_idx = 0; warp_idx < WARP_BATCH; ++warp_idx) {
-        int start_idx = (blockIdx.x * gridDim.y * WARP_BATCH * gridDim.z * batch_seq_len +
-                         (blockIdx.y + gridDim.y * warp_idx) * gridDim.z * batch_seq_len + blockIdx.z * batch_seq_len);
-
-        char4 *p_correlation = (char4 *)(correlation + start_idx);
-        char4 *p_src_padding_mask = (char4 *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-        // load data from global memory
-        // float
-        float4 elements[WARP_ITERATIONS];
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            int element_index = local_idx + it * SOFT_WARP_SIZE;
-            if (element_index < batch_seq_len / 4) {
-                char4 mask = p_src_padding_mask[element_index];
-                char4 correlation_value = p_correlation[element_index];
-
-                elements[it].x =
-                    mask.x ? -INFINITY : (float)correlation_value.x * dequant_scale;
-                elements[it].y =
-                    mask.y ? -INFINITY : (float)correlation_value.y * dequant_scale;
-                elements[it].z =
-                    mask.z ? -INFINITY : (float)correlation_value.z * dequant_scale;
-                elements[it].w =
-                    mask.w ? -INFINITY : (float)correlation_value.w * dequant_scale;
-
-            } else {
-                elements[it].x = -INFINITY;
-                elements[it].y = -INFINITY;
-                elements[it].z = -INFINITY;
-                elements[it].w = -INFINITY;
-            }
-        }
-
-        // compute max_value
-        float max_value = elements[0].x;
-        max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-        max_value = (max_value > elements[0].z) ? max_value : elements[0].z;
-        max_value = (max_value > elements[0].w) ? max_value : elements[0].w;
-
-#pragma unroll
-        for (int it = 1; it < WARP_ITERATIONS; ++it) {
-            max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-            max_value = (max_value > elements[it].y) ? max_value : elements[it].y;
-            max_value = (max_value > elements[it].z) ? max_value : elements[it].z;
-            max_value = (max_value > elements[it].w) ? max_value : elements[it].w;
-        }
-
-        warp_reduce<float, SOFT_WARP_SIZE, Max>(&max_value);
-
-        // exp sum
-        float sum = 0.0f;
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            elements[it].x = __expf(elements[it].x - max_value);
-            elements[it].y = __expf(elements[it].y - max_value);
-            elements[it].z = __expf(elements[it].z - max_value);
-            elements[it].w = __expf(elements[it].w - max_value);
-
-            sum += (elements[it].x + elements[it].y + elements[it].z + elements[it].w);
-        }
-
-        warp_reduce<float, SOFT_WARP_SIZE, Add>(&sum);
-        sum = 1.0f / sum;
-        // store result
-#pragma unroll
-        for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            int element_index = local_idx + it * SOFT_WARP_SIZE;
-            char4 correlation_value;
-            if (element_index < batch_seq_len / 4) {
-                correlation_value.x = float2int8(elements[it].x * sum, quant_scale);
-                correlation_value.y = float2int8(elements[it].y * sum, quant_scale);
-                correlation_value.z = float2int8(elements[it].z * sum, quant_scale);
-                correlation_value.w = float2int8(elements[it].w * sum, quant_scale);
-
-                p_correlation[element_index] = correlation_value;
-
-            } else {
-                break;
-            }
-        }
-    }
-}
-
-void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                            int8_t *correlation, const int8_t *src_padding_mask, float quant_scale,
-                                            float dequant_scale) {
-    const int NUM_INT8_SOFTMAX_BATCH_WARP = 4;
-    if (batch_seq_len > 512) {
-        throw std::runtime_error("batch_seq_len should <= 512");
-    }
-    if (head_num % NUM_INT8_SOFTMAX_BATCH_WARP != 0) {
-        throw std::runtime_error("head_num % NUM_INT8_SOFTMAX_BATCH_WARP !0");
-    }
-    if (batch_seq_len % 4 != 0) {
-        throw std::runtime_error("batch_seq_len % 4 != 0");
-    }
-
-    int log2_elements = log2_ceil(batch_seq_len / 4);
-    int next_power_of_two = 1 << log2_elements;
-    int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    // dim3 blockSize(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP,
-    // batch_seq_len);
-    //
-    dim3 grid(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, batch_seq_len);
-
-    dim3 block(SOFT_WARP_SIZE);
-
-    switch (log2_elements) {
-        case 0:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<0, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-
-            break;
-
-        case 1:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<1, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 2:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<2, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 3:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<3, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 4:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<4, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 5:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<5, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-
-        case 6:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<6, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 7:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<7, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 8:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<8, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        case 9:
-            IxinferCorrelationSoftmaxEncselfI8II8OKernel<9, NUM_INT8_SOFTMAX_BATCH_WARP>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale);
-            break;
-        default:
-            throw std::runtime_error(
-                "ker_correlation_softmax_encself_i8I_i8O_ix_ "
-                "NotImplementedError");
-            break;
-    }
-}
-
-
-__global__ void IxinferArrangeAttenOutputI8II8OKernel(const int8_t *ori_q, int8_t *new_q, int beam_size,
-                                                      int dim_per_head, int head_num, float quant_scale,
-                                                      float dequant_scale) {
-    int hidden_size = dim_per_head * head_num;
-
-#pragma unroll
-    for (int blockin = 0; blockin < 4; blockin++) {
-        int batch_id = (blockIdx.x * 4 + blockin) / beam_size;
-        // note, for encoder, beam_id is token_id; for decoder, beam_id is beam_id
-        int beam_id = (blockIdx.x * 4 + blockin) % beam_size;
-        int i = threadIdx.x;
-        int out_index = (blockIdx.x * 4 + blockin) * hidden_size + i;
-        int head_id = i / dim_per_head;
-        int dim_id = i % dim_per_head;
-
-        char4 *p_ori_q = (char4 *)ori_q;
-        char4 *p_new_q = (char4 *)new_q;
-        char4 value;
-
-        value = p_ori_q[targetid_4dim(batch_id, head_id, beam_id, dim_id, head_num, beam_size, dim_per_head)];
-        value.x = float2int8(value.x * dequant_scale, quant_scale);
-        value.y = float2int8(value.y * dequant_scale, quant_scale);
-        value.z = float2int8(value.z * dequant_scale, quant_scale);
-        value.w = float2int8(value.w * dequant_scale, quant_scale);
-        p_new_q[out_index] = value;
-    }
-}
-
-void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q,
-                                     int8_t *new_q, int beam_size, int dim_per_head, int head_num,
-                                     int max_thread_per_block, float quant_scale, float dequant_scale) {
-    int qual_hidden_size = hidden_size >> 2;
-    int qual_dim_per_head = dim_per_head >> 2;
-    IxinferArrangeAttenOutputI8II8OKernel<<<batch_token_num / 4, qual_hidden_size, 0, stream>>>(
-        ori_q, new_q, beam_size, qual_dim_per_head, head_num, quant_scale, dequant_scale);
-}
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle,
-                                          cudaStream_t& stream) {
-    int batch_token_num = batch_size * batch_seq_len;
-    int max_batch_dim = batch_token_num * hidden_size;
-
-    float scaleCtx = linear_in_amax / _quant_range;
-    float scaleArrange = arrange_qkv_amax / _quant_range;
-    float scaleSoftin = softmax_in_amax / _quant_range;
-    float scaleSoftout = softmax_out_amax / _quant_range;
-
-    float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim);
-    float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx;
-
-    IxinferArrangeEncselfQkvI8II8ONoBias<<<batch_token_num, hidden_size / 4, 0, stream>>>(
-        qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num);
-
-    switch (head_dim) {
-        case 64:
-        case 128:
-        case 192:
-        case 256: {
-            cuinferFlashAttnConfigInfo flashAttnInfo;
-            flashAttnInfo.scaling = sqrt(1.f / (head_dim * 1.0));
-            flashAttnInfo.quantParam.q_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.k_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.v_amax = arrange_qkv_amax;
-            flashAttnInfo.quantParam.p_amax = softmax_out_amax;
-            flashAttnInfo.quantParam.o_amax = linear_in_amax;
-
-            cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc;
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&qDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&kDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&vDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&maskDesc));
-            CUINFER_CHECK(cuinferCreateTensorDescriptor(&oDesc));
-
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, 1, 1, batch_seq_len));
-            CUINFER_CHECK(cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW,
-                                                       CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len,
-                                                       head_dim));
-
-            CUINFER_CHECK(cuinferFMHAForwardEx(cuinfer_handle, flashAttnInfo, qDesc, q_buffer, kDesc, k_buffer, vDesc,
-                                               v_buffer, maskDesc, mask, oDesc, qk_buffer));
-            break;
-        }
-        default: {
-            cuinfer_i8_gemm(k_buffer, q_buffer, nullptr, qkv_buffer, batch_size * head_num, batch_seq_len,
-                            batch_seq_len, head_dim, batch_seq_len * head_dim, batch_seq_len * head_dim,
-                            batch_seq_len * batch_seq_len, scaleBmm1, 0.0, 0, cuinfer_handle, stream);
-
-            IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qkv_buffer, mask,
-                                                   1.0 / scaleSoftout, scaleSoftin);
-
-            cuinfer_nn_i8_gemm(v_buffer, qkv_buffer, qk_buffer, batch_size * head_num, head_dim, batch_seq_len,
-                               batch_seq_len, batch_seq_len * head_dim, batch_seq_len * batch_seq_len,
-                               batch_seq_len * head_dim, scaleBmm2, cuinfer_handle, stream);
-            break;
-        }
-    }
-
-    IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, qk_buffer, qkv_out, batch_seq_len, head_dim,
-                                    head_num, _max_thread_per_block, 1.f, 1.f);
-    return cudaSuccess;
-}
-#else
-template <int THREAD_DATA_LEN>
-__global__ void quant_qkv_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) {
-    float4 val[THREAD_DATA_LEN];
-
-    int block_id = blockIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z;
-    int block_start = block_id * hidden_size;
-    input += block_start;
-    output += block_start;
-
-    int4* p_input = (int4*)input;
-    char4* p_output = (char4*)output;
-
-    float4 bias_val;
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * num_per_tca;
-        char4 q_input;
-        q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale);
-        q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale);
-        q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale);
-        q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale);
-
-        p_output[element_index] = q_input;
-    }
-}
-
-void quantQKVGemm(int32_t* input, int8_t* output, int batch_size, int head_num, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) {
-    if (hidden_size > 4096) {
-        throw std::runtime_error("hidden_size should <= 4096");
-    }
-    int num_per_tca = min(hidden_size / 4, C10_WARP_SIZE); 
-    dim3 gridSize(batch_size, head_num, batch_seq_len);
-    dim3 blockSize(num_per_tca);
-
-    int num_warp = hidden_size / num_per_tca / 4;
-    switch (num_warp) {
-        case 1:
-            quant_qkv_gemm<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 2:
-            quant_qkv_gemm<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 3:
-            quant_qkv_gemm<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 4:
-            quant_qkv_gemm<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 5:
-            quant_qkv_gemm<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 6:
-            quant_qkv_gemm<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 7:
-            quant_qkv_gemm<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 8:
-            quant_qkv_gemm<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 9:
-            quant_qkv_gemm<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 10:
-            quant_qkv_gemm<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 11:
-            quant_qkv_gemm<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 12:
-            quant_qkv_gemm<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 13:
-            quant_qkv_gemm<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 14:
-            quant_qkv_gemm<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 15:
-            quant_qkv_gemm<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        case 16:
-            quant_qkv_gemm<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, output, hidden_size, dequant_scale, num_per_tca);
-            break;
-        default:
-            throw std::runtime_error("quantQKVGemm");
-            break;
-    }
-}
-
-
-cudaError_t fused_multihead_attetion_int8(int8_t *qkv_buffer, int8_t *mask, int8_t *q_buffer, int8_t *k_buffer,
-                                          int8_t *v_buffer, int32_t *qk_out, int8_t *qkv_out, int8_t *qk_buffer, int batch_size,
-                                          int batch_seq_len, int head_dim, int head_num, int hidden_size,
-                                          float arrange_qkv_amax, float softmax_in_amax, float softmax_out_amax,
-                                          float linear_in_amax, cublasLtHandle_t &cublas_lt_handle,
-                                          cudaStream_t &stream) {
-    int batch_token_num = batch_size * batch_seq_len;
-    int max_batch_dim = batch_token_num * hidden_size;
-
-    float scaleCtx = linear_in_amax / _quant_range;
-    float scaleArrange = arrange_qkv_amax / _quant_range;
-    float scaleSoftin = softmax_in_amax / _quant_range;
-    float scaleSoftout = softmax_out_amax / _quant_range;
-
-    float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim);
-    float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx;
-
-    IxinferArrangeEncselfQkvI8II8ONoBias<<<batch_token_num, hidden_size / 4, 0, stream>>>(
-        qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num);
-
-    cublaslt_gemm(k_buffer, q_buffer, qk_out, batch_size * head_num, batch_seq_len, batch_seq_len, head_dim,
-                  batch_seq_len * head_dim, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, 1,
-                  cublas_lt_handle, stream);
-    quantQKVGemm(qk_out, qk_buffer, batch_size, head_num, batch_seq_len, batch_seq_len, scaleBmm1, stream);
-
-    IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qk_buffer, mask,
-                                           1.0 / scaleSoftout, scaleSoftin);
-
-    cublaslt_gemm_nn(v_buffer, qk_buffer, qk_out, batch_size * head_num, head_dim, batch_seq_len, batch_seq_len,
-                     batch_seq_len * head_dim, batch_seq_len * batch_seq_len, batch_seq_len * head_dim, 1,
-                     cublas_lt_handle, stream);
-    quantQKVGemm(qk_out, q_buffer, batch_size, head_num, batch_seq_len, head_dim, scaleBmm2, stream);
-
-    IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, q_buffer, qkv_out, batch_seq_len, head_dim,
-                                    head_num, _max_thread_per_block, 1.f, 1.f);
-    return cudaSuccess;
-}
-#endif
-}  // namespace bert
-}  // namespace nvinfer1::ixrt_plugin
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h
deleted file mode 100644
index b5c501fc..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <cublasLt.h>
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-#include <string>
-#include <vector>
-#ifdef __ILUVATAR__
-#include "ixinfer.h"
-#endif
-
-namespace nvinfer1::ixrt_plugin
-{
-namespace bert
-{
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle,
-                                          cudaStream_t& stream);
-#else
-cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer,
-                                          int8_t* v_buffer, int32_t* qk_out, int8_t* qkv_out, int8_t* qk_buffer,
-                                          int batch_size, int batch_seq_len, int head_dim, int head_num,
-                                          int hidden_size, float arrange_qkv_amax, float softmax_in_amax,
-                                          float softmax_out_amax, float linear_in_amax,
-                                          cublasLtHandle_t& cublas_lt_handle, cudaStream_t& stream);
-#endif
-
-void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                            int8_t *correlation, const int8_t *src_padding_mask, float quant_scale,
-                                            float dequant_scale);
-
-void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q,
-                                     int8_t *new_q, int beam_size, int dim_per_head, int head_num,
-                                     int max_thread_per_block, float quant_scale, float dequant_scale);
-class QKVToContextInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, int32_t const numHeads,
-        vector<float> const dqProbs);
-
-    QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make QKVToContextInt8PluginDynamic without arguments, so we
-    // delete default constructor.
-    QKVToContextInt8PluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-        nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-        nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-        void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;  
-
-protected:
-    void createMHARunner() noexcept;
-    int32_t getSMVersion() const noexcept;
-
-private:
-    std::string const& mLayerName;
-    std::string mNamespace;
-
-    int32_t mS;
-    int32_t mB;
-    int32_t mSM;
-    int32_t mHeadSize;
-    int32_t mHiddenSize;
-    int32_t mNumHeads;
-
-    cuda_unique_ptr<half> mQkvBias;
-
-    vector<float> mDqProbs;
-    bool mUseInt8ScaleMax{true};
-
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-};
-
-class QKVToContextInt8PluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    QKVToContextInt8PluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp
deleted file mode 100644
index a69fb957..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "qkvToContextPlugin.h"
-
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "common_def.cuh"
-#include "cuda_runtime_api.h"
-#include "driver_types.h"
-#include "plugin.h"
-#include "serialize.h"
-#include <cstddef>
-#include <cstdint>
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION{"1"};
-char const* const kQKV_TO_CONTEXT_VAR_SEQLEN_IXRT_PLUGIN_VERSION{"2"};
-char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection QKVToContextPluginDynamicCreator::mFC{};
-std::vector<PluginField> QKVToContextPluginDynamicCreator::mPluginAttributes;
-
-constexpr uint32_t IIDX = 0;  // index of the input tensor
-constexpr uint32_t MIDX = 1;  // index of the mask
-
-QKVToContextPluginDynamicCreator::QKVToContextPluginDynamicCreator() {
-    mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("has_mask", nullptr, PluginFieldType::kINT32, 1));
-    mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 1));
-
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginName() const noexcept {
-    return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME;
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginVersion() const noexcept {
-    return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION;
-}
-
-PluginFieldCollection const* QKVToContextPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* QKVToContextPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "Creating QKV2ContextPlugin..." << endl;
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-        int32_t hiddenSize = 0;
-        // Since numHeads must always exist or validateRequiredAttributes will fail,
-        // we can set numHeads to -1 so that static analysis tools don't warn about
-        // a division by zero in QKVToContextPluginDynamic constructor.
-        int32_t numHeads{-1};
-        bool hasMask = false;
-        int32_t typeId = -1;
-
-        float dqProbs = -1;
-
-        IXRT_PLUGIN_ASSERT(fc->fields != nullptr);
-        ixrt_plugin::validateRequiredAttributesExist({"type_id", "hidden_size", "num_heads", "has_mask"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr);
-            IXRT_PLUGIN_ASSERT(fc->fields[i].data != nullptr);
-            std::string field_name(fc->fields[i].name);
-
-            if (field_name.compare("type_id") == 0) {
-                typeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 2,
-                                        ("QKV: Invalid TypeId " + std::to_string(typeId)).c_str());
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-            if (field_name.compare("hidden_size") == 0) {
-                hiddenSize = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0,
-                                        ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str());
-                gLogInfo << "Building hiddenSize: " << hiddenSize << endl;
-            }
-            if (field_name.compare("num_heads") == 0) {
-                numHeads = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str());
-                gLogInfo << "Building numHeads: " << numHeads << endl;
-            }
-            if (field_name.compare("has_mask") == 0) {
-                auto hasMaskValue = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_CHECK_VALUE(hasMaskValue == 0 || hasMaskValue == 1,
-                                        ("QKV: Invalid hasMask " + std::to_string(hasMaskValue)).c_str());
-                hasMask = static_cast<bool>(hasMaskValue);
-                gLogInfo << "Building hasMask: " << hasMask << endl;
-            }
-        }
-
-        gLogInfo << "Building the Plugin..." << endl;
-        auto type = static_cast<DataType>(typeId);
-        auto* p = new QKVToContextPluginDynamic(name, type, hiddenSize, numHeads, dqProbs, hasMask);
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* QKVToContextPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                               size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call QKVToContextPluginDynamic::destroy()
-    return new QKVToContextPluginDynamic(name, serialData, serialLength);
-}
-
-void QKVToContextPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* QKVToContextPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// REGISTER_TENSORRT_PLUGIN(QKVToContextPluginDynamicCreator);
-//#########################################################################//
-QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const DataType type,
-                                                     const int32_t hiddenSize, const int32_t numHeads,
-                                                     float const dqProbs, bool hasImask)
-    : mLayerName(name),
-      mS(0),
-      mB(0),
-      mHeadSize(hiddenSize / numHeads),
-      mHiddenSize(hiddenSize),
-      mNumHeads(numHeads),
-      mHasImask(hasImask),
-      mType(type)
-
-{
-    //
-}
-
-QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, void const* data, size_t length)
-    : mLayerName(name) {
-    gLogInfo << "QKV Deser Start" << endl;
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mNumHeads);
-    deserialize_value(&data, &length, &mHeadSize);
-    deserialize_value(&data, &length, &mHasImask);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mS);
-    deserialize_value(&data, &length, &mB);
-
-    gLogInfo << "QKV Deser done" << endl;
-}
-
-// IPluginV2 Methods
-char const* QKVToContextPluginDynamic::getPluginType() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; }
-
-char const* QKVToContextPluginDynamic::getPluginVersion() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; }
-
-int32_t QKVToContextPluginDynamic::getNbOutputs() const noexcept { return 1; }
-
-int32_t QKVToContextPluginDynamic::initialize() noexcept { return 0; }
-
-void QKVToContextPluginDynamic::terminate() noexcept {}
-
-size_t QKVToContextPluginDynamic::getSerializationSize() const noexcept {
-    return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mHasImask) + sizeof(mHiddenSize) +
-           sizeof(mS) + sizeof(mB);
-}
-
-void QKVToContextPluginDynamic::serialize(void* buffer) const noexcept {
-    serialize_value(&buffer, mType);
-    serialize_value(&buffer, mNumHeads);
-    serialize_value(&buffer, mHeadSize);
-    serialize_value(&buffer, mHasImask);
-    serialize_value(&buffer, mHiddenSize);
-    serialize_value(&buffer, mS);
-    serialize_value(&buffer, mB);
-}
-
-void QKVToContextPluginDynamic::destroy() noexcept { delete this; }
-
-void QKVToContextPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; }
-
-char const* QKVToContextPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2Ext Methods
-DataType QKVToContextPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes,
-                                                      int32_t /*nbInputs*/) const noexcept {
-    IXRT_PLUGIN_ASSERT(index == 0);
-    IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF ||
-                       inputTypes[0] == DataType::kINT8);
-    return inputTypes[0];
-}
-
-// IPluginV2DynamicExt Methods
-nvinfer1::IPluginV2DynamicExt* QKVToContextPluginDynamic::clone() const noexcept {
-    gLogInfo << "QKV Clone" << endl;
-
-    QKVToContextPluginDynamic* ret = nullptr;
-    ret = new QKVToContextPluginDynamic(mLayerName, mType, mHiddenSize, mNumHeads, mDqProbs, mHasImask);
-
-    ret->setPluginNamespace(mNamespace.c_str());
-    gLogInfo << "QKV Clone done" << endl;
-    return ret;
-}
-
-DimsExprs QKVToContextPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                         int32_t /*nbInputs*/, IExprBuilder& exprBuilder) noexcept {
-    // Input is BxSx3*N*H, output should be BxSxN*H
-    IXRT_PLUGIN_ASSERT(outputIndex == 0);
-    // Copy over everything
-    DimsExprs output(inputs[IIDX]);
-    // Divide last dim by three
-    auto const* three = exprBuilder.constant(3);
-    output.d[HDIM] = exprBuilder.constant(mHiddenSize);
-    return output;
-}
-bool QKVToContextPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                          int32_t /*nbOutputs*/) noexcept {
-    IXRT_PLUGIN_ASSERT(pos >= 0);
-    IXRT_PLUGIN_ASSERT(pos < 2 + mHasImask);
-    IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask);
-    auto const* in = inOut;
-    auto const* out = inOut + nbInputs;
-
-    if (pos == 0) {
-        return (in->type == mType) && (in->format == TensorFormat::kLINEAR);
-    }
-
-    // pos==1
-    if ((mHasImask && pos == 1))  // pos 1 is the mask
-    {
-        auto const* inMask = &inOut[1];
-
-        // detect full mask and check that it was produced
-        return (inMask->type == DataType::kINT32) &&       // precision
-               (inMask->format == TensorFormat::kLINEAR);  // format
-    }
-
-    if (!mHasImask || pos == 2)  // output pos
-    {
-        return (in->type == out->type) && (out->format == TensorFormat::kLINEAR);
-    }
-
-    return false;
-}
-void QKVToContextPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs,
-                                                DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept {
-    IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask);
-    IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-    PluginTensorDesc const& inDesc = in[IIDX].desc;
-    TRT_UNUSED inDesc;
-    PluginTensorDesc const& outDesc = out->desc;
-    TRT_UNUSED outDesc;
-    IXRT_PLUGIN_ASSERT(mType == inDesc.type);
-    IXRT_PLUGIN_ASSERT(mType == outDesc.type);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5)
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1);
-    if (mHasImask) {
-        PluginTensorDesc const& maskDesc = in[MIDX].desc;
-        TRT_UNUSED maskDesc;
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2);
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]);
-        IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]);
-    }
-
-    const int32_t S = inDesc.dims.d[SDIM];
-    const int32_t B = inDesc.dims.d[BDIM] <= 0 ? in->max.d[BDIM] : inDesc.dims.d[BDIM];
-    mS = S;
-    mB = B;
-
-    IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == mS);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1);
-#ifdef __ILUVATAR__
-    CUINFER_CHECK(cuinferCreate(&cuinfer_handle));
-#else
-    CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle));
-#endif
-}
-
-size_t QKVToContextPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                   PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    const int32_t B = inputs->dims.d[BDIM];
-    const int32_t S = inputs->dims.d[SDIM];
-    const int32_t E = inputs->dims.d[2];
-    int32_t fmha_S = S;
-    int64_t buffer_size = B * fmha_S * E;
-#ifndef __ILUVATAR__
-    buffer_size += B * S * S * mNumHeads;
-#endif
-    return 4 * buffer_size * sizeof(mType);
-}
-
-inline void print_element(half* x, int num, string name) {
-    printf("%s: \n", name.c_str());
-    half* out = (half*)malloc(num * sizeof(half));
-    cudaMemcpy(out, x, num * sizeof(half), cudaMemcpyDeviceToHost);
-    for (auto i = 0; i < num; i++) {
-        printf("%f\n", __half2float(out[i]));
-    }
-    printf("\n");
-}
-
-int32_t QKVToContextPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                           void const* const* inputs, void* const* outputs, void* workspace,
-                                           cudaStream_t stream) noexcept {
-    gLogInfo << "in QKVToContextPluginDynamic.." << endl;
-    int32_t S = inputDesc->dims.d[SDIM];
-    int32_t B = inputDesc->dims.d[BDIM];
-    int32_t status = STATUS_SUCCESS;
-#ifdef __ILUVATAR__
-    CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream));
-#endif
-
-    try {
-        if (mType != DataType::kHALF) {
-            gLogError << "embLayerNormPlugin infer type{" << int(mType) << "} not supported!" << endl;
-            return STATUS_NOT_SUPPORTED;
-        }
-        half* qkv_buffer_ = (half*)inputs[0];
-        half* qkv_out_ = (half*)outputs[0];
-        // [B, fmha_S]
-        int32_t* mask_ = mHasImask ? (int32_t*)inputs[1] : nullptr;
-        int fmha_seq_len = S;
-
-        int64_t buffer_size = B * fmha_seq_len * mHiddenSize;
-        half* q_buffer_ = reinterpret_cast<half*>(workspace);
-        half* k_buffer_ = q_buffer_ + buffer_size;
-        half* v_buffer_ = k_buffer_ + buffer_size;
-        
-
-        // [B, S, 3*E, 1, 1] [B, fmha_S]
-#ifdef __ILUVATAR__
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, cuinfer_handle, stream);
-#else    
-        half* qk_out_ = v_buffer_ + buffer_size;
-        auto status =
-            fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, B, mHeadSize,
-                                     mNumHeads, mHiddenSize, S, fmha_seq_len, blaslt_handle, stream);
-#endif
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-
-    } catch (std::exception const& e) {
-        caughtError(e);
-        return STATUS_FAILURE;
-    }
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
deleted file mode 100644
index fb9455c6..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu
+++ /dev/null
@@ -1,317 +0,0 @@
-#include "qkvToContextPlugin.h"
-#include "backend/bert/bert_helper.h"
-#ifdef __ILUVATAR__
-#include "backend/ixinfer/ixinfer_gemm_helper.h"
-#else
-#include "backend/cublas/cublas_helper.h"
-#endif
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-void __global__ IxinferArrangeEncQkvKernel(half *ori_qkv, half *new_q, half *new_k, half *new_v,
-                                           int head_dim, int head_num, int batch_seq_len, int fmha_seq_len) {
-    int hidden_size = head_dim * head_num;
-    int batch_id = blockIdx.x;
-    int token_id = blockIdx.y;
-
-    int i = threadIdx.x;  // 1个线程处理2个数据
-    int head_id = (i * 2) / head_dim;
-    int dim_id = (i * 2) % head_dim;
-
-    half2 *p_ori_qkv = (half2 *)(ori_qkv + batch_id * batch_seq_len * hidden_size * 3 + token_id * hidden_size * 3);
-    half2 *p_new_qkv;
-
-    int target_id = batch_id * head_num * fmha_seq_len * head_dim + head_id * fmha_seq_len * head_dim +
-                    token_id * head_dim + dim_id;
-    /* q */
-    p_new_qkv = (half2 *)(new_q + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* k */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_k + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-    /* v */
-    p_ori_qkv += hidden_size / 2;
-    p_new_qkv = (half2 *)(new_v + target_id);
-    p_new_qkv[0] = p_ori_qkv[i];
-}
-
-void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz,
-                          int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream) {
-    int hsz = head_num * head_dim;
-    if (hsz / 2 > 4096) {
-        throw std::runtime_error("hidden_size / 2 > 4096");
-    }
-    if (hsz % 2 != 0) {
-        throw std::runtime_error("hsz % 2 != 0");
-    }
-    if (head_dim % 2 != 0) {
-        throw std::runtime_error("head_dim %2 != 0");
-    }
-    dim3 blockSize(bsz, ori_seq_len);
-    IxinferArrangeEncQkvKernel<<<blockSize, hsz / 2, 0, stream>>>(ori_qkv, new_q, new_k, new_v, head_dim,
-                                                                  head_num, ori_seq_len, fmha_seq_len);
-}
-
-__global__ void IxinferEncAttnOutArrangeKernel(const half *ori_q, half *new_q, const int bsz, const int ori_seq_len,
-                                               const int fmha_seq_len, const int head_num, const int head_dim) {
-    half2 *p_ori_q = (half2 *)ori_q;
-    half2 *p_new_q = (half2 *)new_q;
-
-    int batch_token_num = ori_seq_len * head_dim * head_num;
-    int hidden_size = head_dim * head_num;
-    int date_length = bsz * ori_seq_len * head_num * head_dim;
-
-    int elem_idx = threadIdx.x + blockIdx.x * blockDim.x;
-    while (elem_idx < date_length / 2) {
-        int half_elem_idx = elem_idx * 2;
-
-        int bsz_idx = half_elem_idx / batch_token_num;
-        int seq_idx = half_elem_idx % batch_token_num / hidden_size;
-        int head_idx = half_elem_idx % batch_token_num % hidden_size / head_dim;
-        int dim_idx = half_elem_idx % batch_token_num % hidden_size % head_dim;
-
-        int src_index = bsz_idx * head_num * fmha_seq_len * head_dim + head_idx * fmha_seq_len * head_dim +
-                        seq_idx * head_dim + dim_idx;
-
-        p_new_q[elem_idx] = p_ori_q[src_index / 2];
-
-        elem_idx += gridDim.x * blockDim.x;
-    }
-}
-
-void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num,
-                              int head_dim, cudaStream_t stream) {
-    if (bsz * ori_seq_len * head_num * head_dim % 2 != 0) {
-        throw std::runtime_error("bsz * ori_seq_len * head_num * head_dim % 2 != 0");
-    }
-    int data_length = bsz * ori_seq_len * head_num * head_dim / 2;
-    int num_threads = 512;
-    int num_blocks = ((data_length - 1 + num_threads) / num_threads);
-    num_blocks = std::min(num_blocks, 128);
-    IxinferEncAttnOutArrangeKernel<<<num_blocks, num_threads, 0, stream>>>(ori_q, new_q, bsz, ori_seq_len, fmha_seq_len,
-                                                                           head_num, head_dim);
-}
-
-
-template <int log2_elements>
-__global__ void IxinferCorrelationSoftmaxEncselfKernel(__half *correlation, const int *src_padding_mask,
-                                                       const int batch_seq_len) {
-    constexpr int next_power_of_two = 1 << log2_elements;
-    constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-    constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE;
-
-    int head_num = blockDim.y;
-    int seq_len = gridDim.y;
-    int start_idx = (blockIdx.x * head_num * seq_len * batch_seq_len + threadIdx.y * seq_len * batch_seq_len +
-                     blockIdx.y * batch_seq_len);
-
-    half2 *p_correlation = (half2 *)(correlation + start_idx);
-    int32_t *p_mask = (int32_t *)(src_padding_mask + blockIdx.x * batch_seq_len);
-
-    int local_idx = threadIdx.x;
-
-    float2 elements[WARP_ITERATIONS];
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        int element_index = local_idx + it * SOFT_WARP_SIZE;
-        if (element_index < batch_seq_len / 2) {
-            half2 correlation_value = p_correlation[element_index];
-
-            elements[it].x =
-                p_mask[element_index * 2] ? -INFINITY : __half2float(correlation_value.x);
-            elements[it].y = p_mask[element_index * 2 + 1] ? -INFINITY
-                                                           : __half2float(correlation_value.y);
-
-        } else {
-            elements[it].x = -INFINITY;
-            elements[it].y = -INFINITY;
-        }
-    }
-
-    float max_value = elements[0].x;
-    max_value = (max_value > elements[0].y) ? max_value : elements[0].y;
-
-#pragma unroll
-    for (int it = 1; it < WARP_ITERATIONS; ++it) {
-        max_value = (max_value > elements[it].x) ? max_value : elements[it].x;
-        max_value = (max_value > elements[it].y) ? max_value : elements[it].y;
-    }
-
-    warp_reduce<float, SOFT_WARP_SIZE, Max>(&max_value);
-
-    float sum = 0.0f;
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        elements[it].x = __expf(elements[it].x - max_value);
-        elements[it].y = __expf(elements[it].y - max_value);
-
-        sum += (elements[it].x + elements[it].y);
-    }
-
-    warp_reduce<float, SOFT_WARP_SIZE, Add>(&sum);
-    sum = 1.0f / sum;
-
-#pragma unroll
-    for (int it = 0; it < WARP_ITERATIONS; ++it) {
-        int element_index = local_idx + it * SOFT_WARP_SIZE;
-        half2 correlation_value;
-        if (element_index < batch_seq_len / 2) {
-            correlation_value.x = __float2half(elements[it].x * sum);
-            correlation_value.y = __float2half(elements[it].y * sum);
-
-            p_correlation[element_index] = correlation_value;
-
-        } else {
-            break;
-        }
-    }
-}
-
-void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                      __half *correlation, const int *src_padding_mask) {
-    if (batch_seq_len > 4096) {
-        throw std::runtime_error("batch_seq_len should <= 4096");
-    }
-    if (batch_seq_len % 2 != 0) {
-        throw std::runtime_error("batch_seq_len % 2 != 0");
-    }
-
-    int log2_elements = log2_ceil(batch_seq_len / 2);
-    int next_power_of_two = 1 << log2_elements;
-    int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
-
-    dim3 grid(batch_size, batch_seq_len);
-
-    dim3 block(WARP_SIZE, head_num);
-
-    switch (log2_elements) {
-        case 0:
-            IxinferCorrelationSoftmaxEncselfKernel<0>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 1:
-            IxinferCorrelationSoftmaxEncselfKernel<1>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 2:
-            IxinferCorrelationSoftmaxEncselfKernel<2>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 3:
-            IxinferCorrelationSoftmaxEncselfKernel<3>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 4:
-            IxinferCorrelationSoftmaxEncselfKernel<4>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 5:
-            IxinferCorrelationSoftmaxEncselfKernel<5>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-
-        case 6:
-            IxinferCorrelationSoftmaxEncselfKernel<6>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 7:
-            IxinferCorrelationSoftmaxEncselfKernel<7>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 8:
-            IxinferCorrelationSoftmaxEncselfKernel<8>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 9:
-            IxinferCorrelationSoftmaxEncselfKernel<9>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 10:
-            IxinferCorrelationSoftmaxEncselfKernel<10>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 11:
-            IxinferCorrelationSoftmaxEncselfKernel<11>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        case 12:
-            IxinferCorrelationSoftmaxEncselfKernel<12>
-                <<<grid, block, 0, stream>>>(correlation, src_padding_mask, batch_seq_len);
-            break;
-        default:
-            throw std::runtime_error("IxinferCorrelationSoftmaxEncself NotImplementedError");
-            break;
-    }
-}
-
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask,
-                              half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out,
-                              int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                              cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc;
-    cuinferDataType_t _cuinferCompType = cuinferDataType_t::CUINFER_DATA_FLOAT;
-    cuinferDataType_t _cuinferDataType = cuinferDataType_t::CUINFER_DATA_HALF;
-    cuinferDataType_t _cuinferMaskType = cuinferDataType_t::CUINFER_DATA_INT32;
-    cuinferCreateTensorDescriptor(&qDesc);
-    cuinferCreateTensorDescriptor(&kDesc);
-    cuinferCreateTensorDescriptor(&vDesc);
-    cuinferCreateTensorDescriptor(&maskDesc);
-    cuinferCreateTensorDescriptor(&oDesc);
-
-    cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                    fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                    fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                    fmha_seq_len, head_dim);
-    cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferMaskType, bsz, 1, 1,
-                                    fmha_seq_len);
-    cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num,
-                                    fmha_seq_len, head_dim);
-
-    cuinferFMHAParam fmha_param;
-    cuinferFMHAForward(cuinfer_handle, fmha_param, _cuinferCompType, _cuinferDataType, _cuinferMaskType, qDesc,
-                        q_buffer, kDesc, k_buffer, vDesc, v_buffer, maskDesc, mask, oDesc, q_buffer, true);
-    
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;
-}
-#else
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, 
-                              half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out,
-                              int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                              cublasLtHandle_t &blaslt_handle, cudaStream_t &stream) {
-    /* qkv arrange*/
-    // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim)
-    IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len,
-                         fmha_seq_len, stream);
-
-    cublaslt_gemm(k_buffer, q_buffer, qk_out, bsz * head_num, fmha_seq_len, fmha_seq_len, head_dim,
-                    fmha_seq_len * head_dim, fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, 1.0/sqrt(head_dim*1.0), blaslt_handle, stream);
- 
-    IxinferCorrelationSoftmaxEncself(bsz, fmha_seq_len, head_num, stream, qk_out, mask);
- 
-    cublaslt_gemm_nn(v_buffer, qk_out, q_buffer, bsz * head_num, head_dim, fmha_seq_len, fmha_seq_len,
-                    fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, fmha_seq_len * head_dim, 1.0f, blaslt_handle, stream);
-
-    IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream);
-    return cudaSuccess;                            
-}
-#endif
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
deleted file mode 100644
index aaee52b7..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#ifdef __ILUVATAR__
-#include <ixinfer.h>
-#endif
-#include <vector>
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-#ifdef __ILUVATAR__
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, 
-                              half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out,
-                              int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                              cuinferHandle_t &cuinfer_handle, cudaStream_t &stream);
-#else
-cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, 
-                              half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out,
-                              int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len,
-                              cublasLtHandle_t &blaslt_handle, cudaStream_t &stream);
-#endif
-
-void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz,
-                          int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream);
-
-void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num,
-                              int head_dim, cudaStream_t stream);
-
-void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream,
-                                      half *correlation, const int *src_padding_mask);
-
-class QKVToContextPluginDynamic : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    QKVToContextPluginDynamic(const std::string name, const nvinfer1::DataType type, const int32_t hiddenSize,
-        const int32_t numHeads, float const dqProbs, bool hasImask = false);
-
-    QKVToContextPluginDynamic(const std::string name, void const* data, size_t length);
-
-    // It doesn't make sense to make QKVToContextPluginDynamic without arguments, so we
-    // delete default constructor.
-    QKVToContextPluginDynamic() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-        nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-        nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-        void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-private:
-    const std::string mLayerName;
-    std::string mNamespace;
-
-    int32_t mS;
-    int32_t mB;
-    int32_t mSM;
-    int32_t mHeadSize;
-    int32_t mHiddenSize;
-    int32_t mNumHeads;
-    bool mHasImask;
-    nvinfer1::DataType mType;
-    float mDqProbs;
-#ifdef __ILUVATAR__
-    cuinferHandle_t cuinfer_handle;
-#else
-    cublasLtHandle_t blaslt_handle;
-#endif
-    cudaStream_t stream;
-    
-    half* query_;
-};
-
-class QKVToContextPluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    QKVToContextPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp
deleted file mode 100644
index 6e4e5a37..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp
+++ /dev/null
@@ -1,404 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "skipLayerNormInt8Plugin.h"
-
-#include "NvInferRuntime.h"
-#include "checkMacrosPlugin.h"
-#include "driver_types.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-// Clip plugin specific constants
-namespace {
-char const* kSKIP_LAYER_NORM_INT8_VERSION_HFACE{"3"};
-char const* kSKIP_LAYER_NORM_INT8_VERSION_MTRON{"4"};
-char const* kSKIP_LAYER_NORM_INT8_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection SkipLayerNormInt8PluginBaseCreator::mFC{};
-std::vector<PluginField> SkipLayerNormInt8PluginBaseCreator::mPluginAttributes;
-
-constexpr auto param_type = DataType::kFLOAT;
-
-SkipLayerNormInt8PluginBaseCreator::SkipLayerNormInt8PluginBaseCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("beta"));
-    mPluginAttributes.emplace_back(PluginField("gamma"));
-    mPluginAttributes.emplace_back(PluginField("bias"));
-    mPluginAttributes.emplace_back(PluginField("output_fp32"));
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-SkipLayerNormInt8PluginHFaceCreator::SkipLayerNormInt8PluginHFaceCreator() : SkipLayerNormInt8PluginBaseCreator() {}
-
-char const* SkipLayerNormInt8PluginBaseCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; }
-
-PluginFieldCollection const* SkipLayerNormInt8PluginBaseCreator::getFieldNames() noexcept { return &mFC; }
-
-void SkipLayerNormInt8PluginBaseCreator::setPluginNamespace(char const* libNamespace) noexcept {
-    mNamespace = libNamespace;
-}
-
-char const* SkipLayerNormInt8PluginBaseCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-char const* SkipLayerNormInt8PluginHFaceCreator::getPluginVersion() const noexcept {
-    return kSKIP_LAYER_NORM_INT8_VERSION_HFACE;
-}
-
-bool buildBetaAndGamma(PluginFieldCollection const* fc, Weights& beta, Weights& gamma, Weights& bias) {
-    ixrt_plugin::validateRequiredAttributesExist({"beta", "gamma"}, fc);
-
-    bool output_fp32 = false;
-
-    for (int32_t i = 0; i < fc->nbFields; i++) {
-        std::string field_name(fc->fields[i].name);
-
-        if (field_name.compare("beta") == 0) {
-            gLogInfo << "Building beta..." << endl;
-            beta.values = fc->fields[i].data;
-            beta.count = fc->fields[i].length;
-            beta.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("gamma") == 0) {
-            gLogInfo << "Building gamma..." << endl;
-            gamma.values = fc->fields[i].data;
-            gamma.count = fc->fields[i].length;
-            gamma.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("bias") == 0) {
-            gLogInfo << "Building bias..." << endl;
-            bias.values = fc->fields[i].data;
-            bias.count = fc->fields[i].length;
-            bias.type = fieldTypeToDataType(fc->fields[i].type);
-        }
-
-        if (field_name.compare("output_fp32") == 0) {
-            IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-            output_fp32 = (static_cast<int32_t const*>(fc->fields[i].data)[0] == 1);
-            gLogInfo << "Building output_fp32" << output_fp32 << endl;
-        }
-    }
-
-    IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta");
-    IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta");
-
-    IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma");
-    IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma");
-    return output_fp32;
-}
-
-IPluginV2* SkipLayerNormInt8PluginHFaceCreator::createPlugin(char const* name,
-                                                             PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "SkipLayerNormInt8PluginHFaceCreator createPlugin" << endl;
-
-        Weights beta{DataType::kFLOAT, nullptr, 0};
-        Weights gamma{DataType::kFLOAT, nullptr, 0};
-        Weights bias{DataType::kFLOAT, nullptr, 0};
-        bool output_fp32 = buildBetaAndGamma(fc, beta, gamma, bias);
-        return new SkipLayerNormInt8PluginHFace(name, beta, gamma, bias, output_fp32);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-IPluginV2* SkipLayerNormInt8PluginHFaceCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                  size_t serialLength) noexcept {
-    // This object will be deleted when the network is destroyed, which will
-    // call SkipLayerNormInterleavedPlugin::destroy()
-    try {
-        gLogInfo << "SkipLayerNormInterleavedPluginHFaceCreator deserializePlugin" << endl;
-        return new SkipLayerNormInt8PluginHFace(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-// REGISTER_TENSORRT_PLUGIN(SkipLayerNormInt8PluginHFaceCreator);
-//#########################################################################//
-SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, Weights const& beta,
-                                                         Weights const& gamma, Weights const& bias, bool output_fp32)
-    : mLayerName(name),
-      mGammaDev(nullptr),
-      mBetaDev(nullptr),
-      mBiasDev(nullptr),
-      mLd(beta.count),
-      mParamsOnDevice(false),
-      output_fp32(output_fp32) {
-    IXRT_PLUGIN_ASSERT(mLd > 0);
-    IXRT_PLUGIN_ASSERT(beta.count == gamma.count);
-    // dataType for beta, gamma weights is always fp16
-    mParamWordsize = getElementSize(param_type);
-
-    mBeta.convertAndCopy(beta, param_type);
-    mGamma.convertAndCopy(gamma, param_type);
-
-    mHasBias = (bias.values != nullptr);
-    if (mHasBias) {
-        mBias.convertAndCopy(bias, param_type);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev);
-    }
-}
-
-SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length)
-    : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mParamsOnDevice(false) {
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mLd);
-    deserialize_value(&data, &length, &mHasBias);
-    deserialize_value(&data, &length, &output_fp32);
-
-    mParamWordsize = getElementSize(param_type);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mLd, param_type);
-    mGamma.convertAndCopy(d, mLd, param_type);
-
-    if (mHasBias) {
-        mBias.convertAndCopy(d, mLd, param_type);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev);
-    }
-}
-
-SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, Weights const& beta,
-                                                           Weights const& gamma, Weights const& bias, bool output_fp32)
-    : SkipLayerNormInt8PluginBase(name, beta, gamma, bias, output_fp32) {}
-
-SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length)
-    : SkipLayerNormInt8PluginBase(name, data, length) {
-    gLogInfo << "SkipLayerNormInt8PluginHFace deserialize" << endl;
-}
-
-// IPluginV2 Methods
-char const* SkipLayerNormInt8PluginBase::getPluginType() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; }
-
-size_t SkipLayerNormInt8PluginBase::getSerializationSize() const noexcept {
-    const size_t biasSize = mHasBias ? (mLd * mParamWordsize) : 0;
-    return 2 * mParamWordsize * mLd + sizeof(mLd) + sizeof(mHasBias) + sizeof(output_fp32) + biasSize;
-}
-
-void SkipLayerNormInt8PluginBase::serialize(void* buffer) const noexcept {
-    try {
-        serialize_value(&buffer, mLd);
-        serialize_value(&buffer, mHasBias);
-        serialize_value(&buffer, output_fp32);
-
-        char* d = static_cast<char*>(buffer);
-        serFromDev(d, static_cast<char*>(mBetaDev.get()), mLd * mParamWordsize);
-        serFromDev(d, static_cast<char*>(mGammaDev.get()), mLd * mParamWordsize);
-        if (mHasBias) {
-            serFromDev(d, static_cast<char*>(mBiasDev.get()), mLd * mParamWordsize);
-        }
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormInt8PluginBase::destroy() noexcept {
-    try {
-        // This gets called when the network containing plugin is destroyed
-        mGammaDev.reset(nullptr);
-        mBetaDev.reset(nullptr);
-        if (mHasBias) {
-            mBiasDev.reset(nullptr);
-        }
-        delete this;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormInt8PluginBase::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; }
-
-char const* SkipLayerNormInt8PluginBase::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// HFace
-int32_t SkipLayerNormInt8PluginHFace::initialize() noexcept {
-    gLogInfo << "SkipLayerNormInterleavedPluginHFace initialize" << endl;
-    return 0;
-}
-
-void SkipLayerNormInt8PluginHFace::terminate() noexcept {
-    gLogInfo << "SkipLayerNormInterleavedPluginHFace terminate" << endl;
-}
-
-void SkipLayerNormInt8PluginHFace::destroy() noexcept {
-    gLogInfo << "SkipLayerNormInterleavedPluginHFace destroy" << endl;
-    SkipLayerNormInt8PluginBase::destroy();
-}
-
-char const* SkipLayerNormInt8PluginHFace::getPluginVersion() const noexcept {
-    return kSKIP_LAYER_NORM_INT8_VERSION_HFACE;
-}
-
-int32_t SkipLayerNormInt8PluginHFace::getNbOutputs() const noexcept { return 2; }
-
-// IPluginV2Ext Methods
-DataType SkipLayerNormInt8PluginBase::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                        int32_t nbInputs) const noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-        IXRT_PLUGIN_ASSERT(index >= 0 && index < getNbOutputs());
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-        if (index == 0) {
-            return output_fp32 ? DataType::kFLOAT : DataType::kINT8;
-        }
-        return DataType::kFLOAT;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DataType{};
-}
-
-// IPluginV2DynamicExt Methods
-DimsExprs SkipLayerNormInt8PluginBase::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                           int32_t nbInputs, IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-        IXRT_PLUGIN_ASSERT(outputIndex >= 0 && outputIndex < getNbOutputs());
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        return inputs[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool SkipLayerNormInt8PluginBase::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut,
-                                                            int32_t nbInputs, int32_t nbOutputs) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inOut != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-        IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs());
-        IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs));
-
-        PluginTensorDesc const& desc = inOut[pos];
-        if (pos == 2 || pos == 4 || (output_fp32 && pos == 3)) {
-            return desc.type == DataType::kFLOAT && desc.format == TensorFormat::kLINEAR;
-        }
-        return desc.type == DataType::kINT8 && desc.format == TensorFormat::kLINEAR;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return false;
-}
-
-void SkipLayerNormInt8PluginBase::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                                  DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs());
-        IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-        auto const& inDims0 = inputs[0].desc.dims;
-        auto const& inDims1 = inputs[1].desc.dims;
-        auto const& inDims2 = inputs[2].desc.dims;
-        TRT_UNUSED inDims1;
-        TRT_UNUSED inDims2;
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims);
-        IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d));
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims2.nbDims);
-        IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims2.d));
-
-        mParamWordsize = getElementSize(param_type);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t SkipLayerNormInt8PluginBase::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                     PluginTensorDesc const* outputs,
-                                                     int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-// HFace IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* SkipLayerNormInt8PluginHFace::clone() const noexcept {
-    try {
-        gLogInfo << "SkipLayerNormInterleavedPluginHFace clone" << endl;
-        auto* p = new SkipLayerNormInt8PluginHFace(mLayerName, mBeta, mGamma, mBias, output_fp32);
-        p->initialize();
-        p->setPluginNamespace(mNamespace.c_str());
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-int32_t SkipLayerNormInt8PluginHFace::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                              void const* const* inputs, void* const* outputs, void* workspace,
-                                              cudaStream_t stream) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        auto const iDesc = inputDesc[0];
-        auto const oDesc = outputDesc[0];
-
-        const int32_t B = iDesc.dims.d[0];
-        const int32_t S = iDesc.dims.d[1];
-        const int32_t E = iDesc.dims.d[2];
-        int batch_token_num = B * S;
-        float const dqScaleIn = iDesc.scale;
-        IXRT_PLUGIN_ASSERT(dqScaleIn > 1e-9);
-        float const qScale = oDesc.scale;
-        int8_t const* input = static_cast<int8_t const*>(inputs[0]);
-        int8_t const* skip = static_cast<int8_t const*>(inputs[1]);
-        float* residual = (float*)inputs[2];
-        float const* gamma = static_cast<float const*>(mGammaDev.get());
-        float const* beta = static_cast<float const*>(mBetaDev.get());
-        float const* bias = static_cast<float const*>(mBiasDev.get());
-        float* residual_out = static_cast<float*>(outputs[1]);
-
-        if (!output_fp32) {
-            int8_t* output = static_cast<int8_t*>(outputs[0]);
-            skipLayerNormI8II8O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E,
-                                dqScaleIn, 1.0 / qScale, 1024, stream, true);
-        } else {
-            float* output = static_cast<float*>(outputs[0]);
-            skipLayerNormI8IF32O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E,
-                                 1.0 / dqScaleIn, 1.0 / qScale, 1024, stream, true);
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu
deleted file mode 100644
index 7cd3e564..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu
+++ /dev/null
@@ -1,361 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#include "backend/bert/bert_helper.h"
-#include "skipLayerNormInt8Plugin.h"
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8II8OKernel(const int8_t *input, const float *scale, const float *bias,
-                                        const float *residual_bias, int8_t *output, float *residual, float* residual_out, 
-                                        int hidden_size, float dequant_scale, float quant_scale,
-                                        bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x *  hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    char4 *p_output = (char4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_residual_bias = (float4 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start; 
-    p_residual_out += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-    // load data from global memory
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-         // vals = dequant(input) + residual + bias
-        p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x;
-        p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y;
-        p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z;
-        p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w;
-        vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale);
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    // mean var
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon,
-                                                      p_scale[element_index], p_bias[element_index]);
-
-        p_residual_out[element_index].x = norm_value.x;
-        p_residual_out[element_index].y = norm_value.y;
-        p_residual_out[element_index].z = norm_value.z;
-        p_residual_out[element_index].w = norm_value.w;
-
-        char4 res = float42char4(norm_value, quant_scale);
-        p_output[element_index] = res;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void skipLayernormI8IF32OKernel(const int8_t *input, const float *scale, const float *bias,
-                                        const float *residual_bias, float *output, float *residual, float* residual_out, 
-                                        int hidden_size, float dequant_scale, float quant_scale,
-                                        bool is_post_ln) {
-    // register
-    // process 2 data
-    float4 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 4;
-    char4 *p_input = (char4 *)input;
-    float4 *p_output = (float4 *)output;
-    float4 *p_residual = (float4 *)residual;
-    float4 *p_residual_out = (float4 *)residual_out;
-    float4 *p_scale = (float4 *)scale;
-    float4 *p_bias = (float4 *)bias;
-    float4 *p_residual_bias = (float4 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-    p_residual_out += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-    // load data from global memory
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-         // vals = dequant(input) + residual + bias
-        p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x;
-        p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y;
-        p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z;
-        p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w;
-        vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale);
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    // mean var
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon,
-                                                      p_scale[element_index], p_bias[element_index]);
-        
-        p_output[element_index].x = norm_value.x;
-        p_output[element_index].y = norm_value.y;
-        p_output[element_index].z = norm_value.z;
-        p_output[element_index].w = norm_value.w;
-    }
-}
-
-
-void skipLayerNormI8II8O(const int8_t *input, const  float *scale, const float *bias, const float *residual_bias, 
-                       int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                       float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                       bool is_post_ln) {
-
-    if (hidden_size > 1024) {
-        throw std::runtime_error("hidden_size should <= 1024");
-    }
-    if (hidden_size % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int num_warp = hidden_size / C10_WARP_SIZE / 4;
-
-    switch (num_warp) {
-        case 1:
-            skipLayernormI8II8OKernel<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 2:
-            skipLayernormI8II8OKernel<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 3:
-            skipLayernormI8II8OKernel<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 4:
-            skipLayernormI8II8OKernel<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 5:
-            skipLayernormI8II8OKernel<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 6:
-            skipLayernormI8II8OKernel<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 7:
-            skipLayernormI8II8OKernel<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 8:
-            skipLayernormI8II8OKernel<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 9:
-            skipLayernormI8II8OKernel<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 10:
-            skipLayernormI8II8OKernel<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 11:
-            skipLayernormI8II8OKernel<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 12:
-            skipLayernormI8II8OKernel<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 13:
-            skipLayernormI8II8OKernel<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 14:
-            skipLayernormI8II8OKernel<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 15:
-            skipLayernormI8II8OKernel<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 16:
-            skipLayernormI8II8OKernel<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        default:
-            throw std::runtime_error("skipLayernormI8II8OKernel");
-            break;
-    }
-}
-
-void skipLayerNormI8IF32O(const int8_t *input, const  float *scale, const float *bias, const float *residual_bias,
-                       float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                       float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                       bool is_post_ln) {
-    if (hidden_size > 1024) {
-        throw std::runtime_error("hidden_size should <= 1024");
-    }
-    if (hidden_size % C10_WARP_SIZE != 0) {
-        throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0");
-    }
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int num_warp = hidden_size / C10_WARP_SIZE / 4;
-
-    switch (num_warp) {
-        case 1:
-            skipLayernormI8IF32OKernel<1>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 2:
-            skipLayernormI8IF32OKernel<2>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 3:
-            skipLayernormI8IF32OKernel<3>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 4:
-            skipLayernormI8IF32OKernel<4>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 5:
-            skipLayernormI8IF32OKernel<5>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 6:
-            skipLayernormI8IF32OKernel<6>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 7:
-            skipLayernormI8IF32OKernel<7>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 8:
-            skipLayernormI8IF32OKernel<8>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 9:
-            skipLayernormI8IF32OKernel<9>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 10:
-            skipLayernormI8IF32OKernel<10>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 11:
-            skipLayernormI8IF32OKernel<11>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 12:
-            skipLayernormI8IF32OKernel<12>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 13:
-            skipLayernormI8IF32OKernel<13>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 14:
-            skipLayernormI8IF32OKernel<14>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 15:
-            skipLayernormI8IF32OKernel<15>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        case 16:
-            skipLayernormI8IF32OKernel<16>
-                <<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size,
-                                                     dequant_scale, quant_scale, is_post_ln);
-            break;
-        default:
-            throw std::runtime_error("skipLayernormI8II8OKernel");
-            break;    
-    }             
-}
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin 
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h
deleted file mode 100644
index f752f59f..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*/
-#pragma once
-
-#include <string>
-#include <vector>
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-
-void skipLayerNormI8II8O(const int8_t *input, const  float *scale, const float *bias, const float *residual_bias, 
-                       int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                       float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                       bool is_post_ln);
-
-void skipLayerNormI8IF32O(const int8_t *input, const  float *scale, const float *bias, const float *residual_bias,
-                       float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale,
-                       float quant_scale, int max_thread_per_block, cudaStream_t stream,
-                       bool is_post_ln);
-
-class SkipLayerNormInt8PluginBase : public nvinfer1::IPluginV2DynamicExt
-{
-public:
-    SkipLayerNormInt8PluginBase(
-        std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32);
-
-    SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make SkipLayerNormInterleavedPlugin without
-    // arguments, so we delete default constructor.
-    SkipLayerNormInt8PluginBase() = delete;
-
-    // IPluginV2 Methods
-    char const* getPluginType() const noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext Methods
-    nvinfer1::DataType getOutputDataType(
-        int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs,
-        nvinfer1::IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(
-        int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override;
-    void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs,
-        nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs,
-        nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override;
-
-protected:
-    std::string const& mLayerName;
-    std::string mNamespace;
-
-    bert::cuda_unique_ptr<void> mGammaDev;
-    bert::cuda_unique_ptr<void> mBetaDev;
-    size_t mLd{}; // leading dim
-    bert::WeightsWithOwnership mGamma;
-    bert::WeightsWithOwnership mBeta;
-
-    size_t mParamWordsize{};
-    bool mParamsOnDevice{};
-    bool mHasBias{};
-    cuda_unique_ptr<void> mBiasDev;
-    WeightsWithOwnership mBias;
-    bool output_fp32{};
-};
-
-class SkipLayerNormInt8PluginHFace : public SkipLayerNormInt8PluginBase
-{
-public:
-    SkipLayerNormInt8PluginHFace(
-        std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32);
-
-    SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length);
-
-    // It doesn't make sense to make SkipLayerNormInterleavedPlugin without
-    // arguments, so we delete default constructor.
-    SkipLayerNormInt8PluginHFace() = delete;
-
-    // IPluginV2DynamicExt Methods
-    nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
-    int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc,
-        void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-    // IPluginV2 Methods
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    void destroy() noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-};
-
-class SkipLayerNormInt8PluginBaseCreator : public nvinfer1::IPluginCreator
-{
-public:
-    SkipLayerNormInt8PluginBaseCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-class SkipLayerNormInt8PluginHFaceCreator : public SkipLayerNormInt8PluginBaseCreator
-{
-public:
-    SkipLayerNormInt8PluginHFaceCreator();
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp
deleted file mode 100644
index 4ca63061..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp
+++ /dev/null
@@ -1,430 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include "skipLayerNormPlugin.h"
-
-#include "bertCommon.h"
-#include "checkMacrosPlugin.h"
-#include "plugin.h"
-#include "serialize.h"
-
-using namespace nvinfer1;
-using namespace nvinfer1::ixrt_plugin;
-using namespace nvinfer1::ixrt_plugin::bert;
-
-namespace {
-char const* kSKIP_LAYER_NORM_VERSION{"1"};
-char const* kSKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"};
-char const* kSKIP_LAYER_NORM_VAR_SEQLEN_VERSION{"2"};
-}  // namespace
-
-// Static class fields initialization
-PluginFieldCollection SkipLayerNormPluginDynamicCreator::mFC{};
-std::vector<PluginField> SkipLayerNormPluginDynamicCreator::mPluginAttributes;
-
-// REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginDynamicCreator);
-
-static inline DataType getParamWordType(DataType cfgType) noexcept {
-    if (cfgType == DataType::kINT8) {
-        return DataType::kHALF;
-    }
-
-    return cfgType;
-}
-
-SkipLayerNormPluginDynamicCreator::SkipLayerNormPluginDynamicCreator() {
-    mPluginAttributes.clear();
-    mPluginAttributes.emplace_back(PluginField("ld"));
-    mPluginAttributes.emplace_back(PluginField("type_id"));
-    mPluginAttributes.emplace_back(PluginField("beta"));
-    mPluginAttributes.emplace_back(PluginField("gamma"));
-    mPluginAttributes.emplace_back(PluginField("bias"));
-    mFC.nbFields = mPluginAttributes.size();
-    mFC.fields = mPluginAttributes.data();
-}
-
-char const* SkipLayerNormPluginDynamicCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_NAME; }
-
-char const* SkipLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; }
-
-PluginFieldCollection const* SkipLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; }
-
-IPluginV2* SkipLayerNormPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamicCreator createPlugin" << endl;
-
-        int32_t ld = 0;
-        Weights beta{DataType::kFLOAT, nullptr, 0};
-        Weights gamma{DataType::kFLOAT, nullptr, 0};
-        Weights bias{DataType::kFLOAT, nullptr, 0};
-        int32_t typeId = -1;
-
-        IXRT_PLUGIN_ASSERT(fc != nullptr);
-
-        ixrt_plugin::validateRequiredAttributesExist({"type_id", "beta", "ld", "gamma"}, fc);
-
-        for (int32_t i = 0; i < fc->nbFields; i++) {
-            std::string field_name(fc->fields[i].name);
-            if (field_name.compare("ld") == 0) {
-                ld = *static_cast<int32_t const*>(fc->fields[i].data);
-                gLogInfo << "Building ld: " << ld << endl;
-            }
-
-            if (field_name.compare("type_id") == 0) {
-                typeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                gLogInfo << "Building typeId: " << typeId << endl;
-            }
-
-            if (field_name.compare("beta") == 0) {
-                gLogInfo << "Building beta..." << endl;
-                beta.values = fc->fields[i].data;
-                beta.count = fc->fields[i].length;
-                beta.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("gamma") == 0) {
-                gLogInfo << "Building gamma..." << endl;
-                gamma.values = fc->fields[i].data;
-                gamma.count = fc->fields[i].length;
-                gamma.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bias") == 0) {
-                gLogInfo << "Building bias..." << endl;
-                bias.values = fc->fields[i].data;
-                bias.count = fc->fields[i].length;
-                bias.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-        }
-        gLogInfo << "Type " << typeId << endl;
-
-        IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 3,
-                                ("SkipLayerNorm: Invalid type ID: " + std::to_string(typeId)).c_str());
-
-        IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta");
-        IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta");
-
-        IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma");
-        IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma");
-
-        IXRT_PLUGIN_CHECK_VALUE(typeId == (int)DataType::kHALF, "typeId != DataType::kHALF error");
-
-        return new SkipLayerNormPluginDynamic(name, static_cast<DataType>(typeId), ld, beta, gamma, bias);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-nvinfer1::IPluginV2* SkipLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData,
-                                                                          size_t serialLength) noexcept {
-    try {
-        return new SkipLayerNormPluginDynamic(name, serialData, serialLength);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-void SkipLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept {
-    try {
-        mNamespace = pluginNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* SkipLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-//#########################################################################//
-SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string name, const DataType type, int32_t const ld,
-                                                       Weights const& beta, Weights const& gamma, Weights const& bias)
-    : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mHiddenSize(ld), mType(type), mBiasDev(nullptr) {
-    IXRT_PLUGIN_ASSERT(mType == nvinfer1::DataType::kFLOAT || mType == nvinfer1::DataType::kHALF ||
-                       mType == nvinfer1::DataType::kINT8);
-
-    mCfgType = mType == DataType::kINT8 ? DataType::kHALF : mType;
-    mParamWordsize = getElementSize(mCfgType);
-
-    mBeta.convertAndCopy(beta, mCfgType);
-    mGamma.convertAndCopy(gamma, mCfgType);
-
-    mHasBias = (bias.values != nullptr);
-    if (mHasBias) {
-        mBias.convertAndCopy(bias, mCfgType);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev);
-    }
-}
-
-SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string& name, void const* data, size_t length)
-    : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mBiasDev(nullptr) {
-    gLogInfo << "SkipLayerNormPluginDynamic deserialize" << endl;
-
-    // Deserialize in the same order as serialization
-    deserialize_value(&data, &length, &mType);
-    deserialize_value(&data, &length, &mCfgType);
-    deserialize_value(&data, &length, &mHiddenSize);
-    deserialize_value(&data, &length, &mHasBias);
-
-    IXRT_PLUGIN_ASSERT(mCfgType == nvinfer1::DataType::kFLOAT || mCfgType == nvinfer1::DataType::kHALF);
-    mParamWordsize = getElementSize(mCfgType);
-
-    char const* d = static_cast<char const*>(data);
-    mBeta.convertAndCopy(d, mHiddenSize, mCfgType);
-    mGamma.convertAndCopy(d, mHiddenSize, mCfgType);
-    if (mHasBias) {
-        mBias.convertAndCopy(d, mHiddenSize, mCfgType);
-    }
-
-    copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev);
-    copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev);
-    if (mHasBias) {
-        copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev);
-    }
-}
-
-// IPluginV2Ext Methods
-DataType SkipLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes,
-                                                       int32_t nbInputs) const noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputTypes != nullptr);
-        IXRT_PLUGIN_ASSERT(index == 0);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        return inputTypes[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DataType{};
-}
-
-// IPluginV2 Methods
-char const* SkipLayerNormPluginDynamic::getPluginType() const noexcept { return kSKIP_LAYER_NORM_NAME; }
-
-char const* SkipLayerNormPluginDynamic::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; }
-
-int32_t SkipLayerNormPluginDynamic::getNbOutputs() const noexcept { return 1; }
-int32_t SkipLayerNormPluginDynamic::initialize() noexcept {
-    gLogInfo << "SkipLayerNormPluginDynamic initialize" << endl;
-    return 0;
-}
-
-void SkipLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "SkipLayerNormPluginDynamic terminate" << endl; }
-
-size_t SkipLayerNormPluginDynamic::getSerializationSize() const noexcept {
-    const size_t biasSize = mHasBias ? (mHiddenSize * mParamWordsize) : 0;
-    return 2 * mParamWordsize * mHiddenSize + 2 * sizeof(DataType) + sizeof(mHiddenSize) + biasSize + sizeof(mHasBias);
-}
-
-void SkipLayerNormPluginDynamic::serialize(void* buffer) const noexcept {
-    try {
-        serialize_value(&buffer, mType);
-        serialize_value(&buffer, mCfgType);
-        serialize_value(&buffer, mHiddenSize);
-        serialize_value(&buffer, mHasBias);
-
-        char* d = static_cast<char*>(buffer);
-        serFromDev(d, static_cast<char*>(mBetaDev.get()), mHiddenSize * mParamWordsize);
-        serFromDev(d, static_cast<char*>(mGammaDev.get()), mHiddenSize * mParamWordsize);
-        if (mHasBias) {
-            serFromDev(d, static_cast<char*>(mBiasDev.get()), mHiddenSize * mParamWordsize);
-        }
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormPluginDynamic::destroy() noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic destroy" << endl;
-        // This gets called when the network containing plugin is destroyed
-        mGammaDev.reset(nullptr);
-        mBetaDev.reset(nullptr);
-        if (mHasBias) {
-            mBiasDev.reset(nullptr);
-        }
-        delete this;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-void SkipLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept {
-    try {
-        mNamespace = libNamespace;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-char const* SkipLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); }
-
-// IPluginV2DynamicExt Methods
-IPluginV2DynamicExt* SkipLayerNormPluginDynamic::clone() const noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic clone" << endl;
-
-        auto* p = new SkipLayerNormPluginDynamic(mLayerName, mType, mHiddenSize, mBeta, mGamma, mBias);
-        p->initialize();
-        p->setPluginNamespace(mNamespace.c_str());
-        return p;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return nullptr;
-}
-
-DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs,
-                                                          int32_t nbInputs, IExprBuilder& exprBuilder) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        IXRT_PLUGIN_ASSERT(outputIndex == 0);
-        IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims);
-        return inputs[0];
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return DimsExprs{};
-}
-
-bool SkipLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                                           int32_t nbOutputs) noexcept {
-    try {
-        IXRT_PLUGIN_ASSERT(inOut != nullptr);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs));
-
-        PluginTensorDesc const& in = inOut[pos];
-        if (pos == 0) {
-            return (in.type == mType) && (in.format == TensorFormat::kLINEAR);
-        }
-        PluginTensorDesc const& prev = inOut[pos - 1];
-
-        return in.type == prev.type && in.format == prev.format && (in.type == DataType::kHALF);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return false;
-}
-
-void SkipLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs,
-                                                 DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept {
-    try {
-        gLogInfo << "SkipLayerNormPluginDynamic configurePlugin" << endl;
-
-        // Validate input arguments
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        IXRT_PLUGIN_ASSERT(nbOutputs == 1);
-        IXRT_PLUGIN_ASSERT(nbInputs == 2);
-        if (mType == DataType::kFLOAT || mType == DataType::kHALF) {
-            IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type);
-            IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type);
-        } else {
-            IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type || DataType::kFLOAT == inputs[0].desc.type);
-            IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type || DataType::kFLOAT == inputs[1].desc.type);
-        }
-        auto const& inDims0 = inputs[0].desc.dims;
-        auto const& inDims1 = inputs[1].desc.dims;
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims);
-
-        IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d));
-
-        IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5);
-        mHiddenSize = inDims0.d[HDIM];  // hiddensize
-        IXRT_PLUGIN_ASSERT(mHiddenSize != 0U);
-        IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1);
-        IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1);
-        IXRT_PLUGIN_ASSERT(outputs[0].desc.type == DataType::kHALF);
-
-        mCfgType = inputs[0].desc.type == DataType::kINT8 ? DataType::kHALF : inputs[0].desc.type;
-
-        auto const paramType = getParamWordType(mCfgType);
-        mParamWordsize = getElementSize(paramType);
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-}
-
-size_t SkipLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                    PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept {
-    return 0;
-}
-
-int32_t SkipLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                            void const* const* inputs, void* const* outputs, void* workspace,
-                                            cudaStream_t stream) noexcept {
-    gLogInfo << "in SkipLayerNormPluginDynamic.." << endl;
-    int32_t status = -1;
-    try {
-        IXRT_PLUGIN_ASSERT(inputs != nullptr);
-        IXRT_PLUGIN_ASSERT(outputs != nullptr);
-        int32_t const inputVolume = volume(inputDesc[0].dims);
-        DataType iType = inputDesc->type;
-
-        // Our plugin outputs only one tensor
-        // Launch CUDA kernel wrapper and save its return value
-        if (iType == DataType::kFLOAT) {
-            gLogInfo << "SkipLayerNormPlugin fp32 not supported yet!" << endl;
-            return STATUS_NOT_SUPPORTED;
-        } else if (iType == DataType::kHALF) {
-            auto const* input = static_cast<half const*>(inputs[0]);
-            auto skip = (half*)(inputs[1]);
-            auto* output = static_cast<half*>(outputs[0]);
-            auto const* const bias = static_cast<half const*>(mBiasDev.get());
-            auto const* const beta = static_cast<half const*>(mBetaDev.get());
-            auto const* const gamma = static_cast<half const*>(mGammaDev.get());
-
-            if (mHasBias) {
-                status = computeSkipLayerNorm<half, true>(stream, static_cast<int32_t>(mHiddenSize), inputVolume, input,
-                                                          gamma, beta, bias, skip, output);
-            } else {
-                status = computeSkipLayerNorm<half, false>(stream, static_cast<int32_t>(mHiddenSize), inputVolume,
-                                                           input, gamma, beta, bias, skip, output);
-            }
-        } else {
-            IXRT_PLUGIN_CHECK_VALUE(false, "Unsupported type error, expected [kHALF,kFLOAT], but received " +
-                                               std::to_string(static_cast<int32_t>(iType)));
-        }
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu
deleted file mode 100644
index 1b127fc5..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu
+++ /dev/null
@@ -1,401 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#include <cuda_fp16.h>
-#include <cuda_runtime_api.h>
-#include <cassert>
-
-#include "backend/bert/bert_helper.h"
-#include "skipLayerNormPlugin.h"
-// #include "backend/transformer/transformer_add_norm.h"
-
-using namespace nvinfer1::ixrt_plugin::backend;
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias,
-                                         const half *residual_bias, half *output, half *residual, int hidden_size,
-                                         bool is_post_ln) {
-    float2 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 2;
-    half2 *p_input = (half2 *)input;
-    half2 *p_output = (half2 *)output;
-    half2 *p_residual = (half2 *)residual;
-    half2 *p_scale = (half2 *)scale;
-    half2 *p_bias = (half2 *)bias;
-    half2 *p_residual_bias = (half2 *)residual_bias;
-    // one line start
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        if (element_index < hidden_size / 2) {
-            half2 value1 = p_input[element_index];
-            half2 value2 = p_residual[element_index];
-
-            vals[it].x = __half2float(value1.x) + __half2float(value2.x);
-            vals[it].y = __half2float(value1.y) + __half2float(value2.y);
-
-            half2 res_bias_val_1;
-            if (residual_bias == nullptr) {
-                res_bias_val_1.x = __float2half(0.0f);
-                res_bias_val_1.y = __float2half(0.0f);
-            } else {
-                res_bias_val_1 = p_residual_bias[element_index];
-            }
-            vals[it].x = vals[it].x + __half2float(res_bias_val_1.x);
-            vals[it].y = vals[it].y + __half2float(res_bias_val_1.y);
-
-            WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-            WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-        }
-    }
-
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        if (element_index < hidden_size / 2) {
-            float2 norm_value;
-            half2 scale_1 = p_scale[element_index];
-            half2 bias_1 = p_bias[element_index];
-            norm_value.x = (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) +
-                           __half2float(bias_1.x);
-            norm_value.y = (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) +
-                           __half2float(bias_1.y);
-
-            half2 res;
-            res.x = __float2half(norm_value.x);
-            res.y = __float2half(norm_value.y);
-
-            p_output[element_index] = res;
-
-            half2 r1;
-            if (is_post_ln) {
-                r1 = res;
-            } else {
-                r1.x = __float2half(vals[it].x);
-                r1.y = __float2half(vals[it].y);
-            }
-            p_residual[element_index] = r1;
-        }
-    }
-}
-
-void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                              half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                              bool is_post_ln) {
-    if (hidden_size > 2048) {
-        throw std::runtime_error("hidden_size should <= 1024");
-    }
-    if (hidden_size % 2 != 0) {
-        throw std::runtime_error("hidden_size % 2 != 0");
-    }
-
-    dim3 gridSize(batch_tokens);
-    dim3 blockSize(C10_WARP_SIZE);
-
-    int neareast_hidden_size = hidden_size;
-    if (neareast_hidden_size % (C10_WARP_SIZE * 2) != 0) {
-        neareast_hidden_size = neareast_hidden_size + C10_WARP_SIZE * 2 - neareast_hidden_size % (C10_WARP_SIZE * 2);
-    }
-
-    int num_warp = neareast_hidden_size / C10_WARP_SIZE / 2;
-
-    switch (num_warp) {
-        case 1:
-            IxinferResidualBiasLnPad<1><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 2:
-            IxinferResidualBiasLnPad<2><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 3:
-            IxinferResidualBiasLnPad<3><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 4:
-            IxinferResidualBiasLnPad<4><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 5:
-            IxinferResidualBiasLnPad<5><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 6:
-            IxinferResidualBiasLnPad<6><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 7:
-            IxinferResidualBiasLnPad<7><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 8:
-            IxinferResidualBiasLnPad<8><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 9:
-            IxinferResidualBiasLnPad<9><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                            residual, hidden_size, is_post_ln);
-            break;
-        case 10:
-            IxinferResidualBiasLnPad<10><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 11:
-            IxinferResidualBiasLnPad<11><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 12:
-            IxinferResidualBiasLnPad<12><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 13:
-            IxinferResidualBiasLnPad<13><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 14:
-            IxinferResidualBiasLnPad<14><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 15:
-            IxinferResidualBiasLnPad<15><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        case 16:
-            IxinferResidualBiasLnPad<16><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-            break;
-        default:
-            std::cout << "hidden size: " << hidden_size << std::endl;
-            throw std::runtime_error("IxinferResidualBiasLnPad not supported!");
-            break;
-    }
-}
-
-template <int THREAD_DATA_LEN>
-__global__ void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                                      half *output, half *residual, int hidden_size, bool is_post_ln) {
-    float2 vals[THREAD_DATA_LEN];
-    int block_start = blockIdx.x * hidden_size / 2;
-    half2 *p_input = (half2 *)input;
-    half2 *p_output = (half2 *)output;
-    half2 *p_residual = (half2 *)residual;
-    half2 *p_scale = (half2 *)scale;
-    half2 *p_bias = (half2 *)bias;
-    half2 *p_residual_bias = (half2 *)residual_bias;
-
-    p_input += block_start;
-    p_output += block_start;
-    p_residual += block_start;
-
-    float thread_m2 = 0;
-    float thread_mean = 0;
-    float thread_count = 0;
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        half2 value1 = p_input[element_index];
-        half2 value2 = p_residual[element_index];
-
-        vals[it].x = __half2float(value1.x) + __half2float(value2.x);
-        vals[it].y = __half2float(value1.y) + __half2float(value2.y);
-
-        half2 res_bias_val_1;
-        if (residual_bias == nullptr) {
-            res_bias_val_1.x = __float2half(0.0f);
-            res_bias_val_1.y = __float2half(0.0f);
-        } else {
-            res_bias_val_1 = p_residual_bias[element_index];
-        }
-        vals[it].x = vals[it].x + __half2float(res_bias_val_1.x);
-        vals[it].y = vals[it].y + __half2float(res_bias_val_1.y);
-
-        WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count);
-        WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count);
-    }
-
-    float mean = 0;
-    float m2 = 0;
-    float count = 0;
-    WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count);
-    mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE);
-    m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE);
-    count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE);
-
-#pragma unroll
-    for (int it = 0; it < THREAD_DATA_LEN; ++it) {
-        int element_index = threadIdx.x + it * C10_WARP_SIZE;
-        float2 norm_value;
-        half2 scale_1 = p_scale[element_index];
-        half2 bias_1 = p_bias[element_index];
-        norm_value.x =
-            (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x);
-        norm_value.y =
-            (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y);
-
-        half2 res;
-        res.x = __float2half(norm_value.x);
-        res.y = __float2half(norm_value.y);
-
-        p_output[element_index] = res;
-
-        half2 r1;
-        if (is_post_ln) {
-            r1 = res;
-        } else {
-            r1.x = __float2half(vals[it].x);
-            r1.y = __float2half(vals[it].y);
-        }
-        p_residual[element_index] = r1;
-    }
-}
-
-void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                           half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                           bool is_post_ln) {
-    if (hidden_size > 2048) {
-        throw std::runtime_error("hidden_size should <= 1024");
-    }
-    if ((hidden_size % 2 == 0) && (hidden_size % (C10_WARP_SIZE * 2) != 0)) {
-        IxinferResidualBiasLnPad(input, scale, bias, residual_bias, output, residual, batch_tokens, hidden_size, stream,
-                                 is_post_ln);
-    } else {
-        if (hidden_size % (C10_WARP_SIZE * 2) != 0) {
-            throw std::runtime_error("hidden_size // (C10_WARP_SIZE*2) != 0");
-        }
-        dim3 gridSize(batch_tokens);
-        dim3 blockSize(C10_WARP_SIZE);
-
-        int num_warp = hidden_size / C10_WARP_SIZE / 2;
-
-        switch (num_warp) {
-            case 1:
-                IxinferResidualBiasLn<1><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 2:
-                IxinferResidualBiasLn<2><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 3:
-                IxinferResidualBiasLn<3><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 4:
-                IxinferResidualBiasLn<4><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 5:
-                IxinferResidualBiasLn<5><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 6:
-                IxinferResidualBiasLn<6><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 7:
-                IxinferResidualBiasLn<7><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 8:
-                IxinferResidualBiasLn<8><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 9:
-                IxinferResidualBiasLn<9><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                             residual, hidden_size, is_post_ln);
-                break;
-            case 10:
-                IxinferResidualBiasLn<10><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 11:
-                IxinferResidualBiasLn<11><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 12:
-                IxinferResidualBiasLn<12><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 13:
-                IxinferResidualBiasLn<13><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 14:
-                IxinferResidualBiasLn<14><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 15:
-                IxinferResidualBiasLn<15><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            case 16:
-                IxinferResidualBiasLn<16><<<gridSize, blockSize, 0, stream>>>(input, scale, bias, residual_bias, output,
-                                                                              residual, hidden_size, is_post_ln);
-                break;
-            default:
-                throw std::runtime_error("IxinferResidualBiasLn");
-                break;
-        }
-    }
-}
-
-template <typename T, bool has_bias>
-int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output)
-{
-    assert(volume % E == 0);
-    int32_t batch_tokens = volume / E;
-    IxinferResidualBiasLn(input, gamma, beta, bias, output, skip, batch_tokens, E, stream, true);
-    return 0;
-}
-
-template int32_t computeSkipLayerNorm<half, true>(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*);
-template int32_t computeSkipLayerNorm<half, false>(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*);
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h
deleted file mode 100644
index fa37318f..00000000
--- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-* All Rights Reserved.
-*
-*    Licensed under the Apache License, Version 2.0 (the "License"); you may
-*    not use this file except in compliance with the License. You may obtain
-*    a copy of the License at
-*
-*         http://www.apache.org/licenses/LICENSE-2.0
-*
-*    Unless required by applicable law or agreed to in writing, software
-*    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-*    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-*    License for the specific language governing permissions and limitations
-*    under the License.
-*
-* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-* SPDX-License-Identifier: Apache-2.0
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-#pragma once
-#include <string>
-#include <vector>
-
-#include "NvInferRuntime.h"
-#include "bertCommon.h"
-
-namespace nvinfer1::ixrt_plugin {
-namespace bert {
-
-template <typename T, bool has_bias>
-int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output);
-
-void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                           half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                           bool is_post_ln);
-
-void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias,
-                              half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream,
-                              bool is_post_ln);
-class SkipLayerNormPluginDynamic : public IPluginV2DynamicExt {
-   public:
-    SkipLayerNormPluginDynamic(const std::string name, const nvinfer1::DataType type, int32_t const ld,
-        nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias);
-    SkipLayerNormPluginDynamic(const std::string &name, void const* data, size_t length);
-    SkipLayerNormPluginDynamic() noexcept = delete;
-    ~SkipLayerNormPluginDynamic() override = default;
-
-    // IPluginV2 methods
-    char const* getPluginType() const noexcept override;
-    char const* getPluginVersion() const noexcept override;
-    int32_t getNbOutputs() const noexcept override;
-    int32_t initialize() noexcept override;
-    void terminate() noexcept override;
-    size_t getSerializationSize() const noexcept override;
-    void serialize(void* buffer) const noexcept override;
-    void destroy() noexcept override;
-    void setPluginNamespace(char const* libNamespace) noexcept override;
-    char const* getPluginNamespace() const noexcept override;
-
-    // IPluginV2Ext methods
-    DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override;
-
-    // IPluginV2DynamicExt methods
-    IPluginV2DynamicExt* clone() const noexcept override;
-    DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs,
-                                  IExprBuilder& exprBuilder) noexcept override;
-    bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs,
-                                   int32_t nbOutputs) noexcept override;
-    void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out,
-                         int32_t nbOutputs) noexcept override;
-    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
-                            int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
-                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-   private:
-    const std::string mLayerName;
-    std::string mNamespace;
-    cuda_unique_ptr<void> mGammaDev;
-    cuda_unique_ptr<void> mBetaDev;
-    WeightsWithOwnership mGamma;
-    WeightsWithOwnership mBeta;
-    size_t mHiddenSize{};
-    size_t mParamWordsize{};
-    DataType mType;
-    DataType mCfgType;
-    // mCfgType is the dataType for beta, gamma bias weights, always fp16 or fp32
-    // mType is the plugin IO datatype, can be int8
-    
-    bool mHasBias{};
-    cuda_unique_ptr<void> mBiasDev;
-    WeightsWithOwnership mBias;
-};
-
-class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    SkipLayerNormPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 660bdfcd..0f025cf8 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -6012,7 +6012,8 @@
             "download_url": "https://local/bert-large-uncased",
             "need_third_part": "",
             "precisions": [
-                "fp16"
+                "fp16",
+                "int8"
             ],
             "type": "inference",
             "hasDemo": false,
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 0508d8d8..796ef109 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -469,16 +469,16 @@ def run_nlp_testcase(model):
         elif model_name == "bert_large_squad":
             script = f"""
             set -x
-            cd ../{model['model_path']}/python
-            bash script/build_engine.sh --bs 32
-            bash script/inference_squad.sh --bs 32
+            cd ../{model['model_path']}
+            bash scripts/infer_bert_large_squad_fp16_accuracy.sh
+            bash scripts/infer_bert_large_squad_fp16_performance.sh
             """
             if prec == "int8":
                 script = f"""
                 set -x
-                cd ../{model['model_path']}/python
-                bash script/build_engine.sh --bs 32 --int8
-                bash script/inference_squad.sh --bs 32 --int8
+                cd ../{model['model_path']}
+                bash scripts/infer_bert_large_squad_int8_accuracy.sh
+                bash scripts/infer_bert_large_squad_int8_performance.sh
                 """
 
         r, t = run_script(script)
-- 
Gitee


From 2f373dfc01a9e58b82f0daa5c692df2800e7b582 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 15:24:14 +0800
Subject: [PATCH 12/15] test ixrt modelzoo yolov3

---
 .../yolov3/ixrt/build_nms_engine.py           | 18 ++++-
 .../yolov3/ixrt/calibration_dataset.py        |  3 -
 .../cv/object_detection/yolov3/ixrt/common.py | 22 ++++++-
 .../yolov3/ixrt/datasets/common.py            |  4 +-
 .../yolov3/ixrt/datasets/post_process.py      | 44 ++++++++++++-
 .../yolov3/ixrt/datasets/pre_process.py       | 22 ++++++-
 .../cv/object_detection/yolov3/ixrt/deploy.py | 53 +++++++--------
 .../object_detection/yolov3/ixrt/inference.py | 65 +++++++++++++------
 .../yolov3/ixrt/modify_batchsize.py           | 19 +-----
 .../cv/object_detection/yolov3/ixrt/quant.py  |  3 -
 .../scripts/infer_yolov3_fp16_accuracy.sh     | 17 ++---
 .../scripts/infer_yolov3_fp16_performance.sh  | 22 +++----
 .../scripts/infer_yolov3_int8_accuracy.sh     | 17 ++---
 .../scripts/infer_yolov3_int8_performance.sh  | 19 ++----
 14 files changed, 201 insertions(+), 127 deletions(-)

diff --git a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
index 25f0ab8a..d260fe48 100644
--- a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
+++ b/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
@@ -5,9 +5,9 @@ import onnx
 from onnx import helper
 from onnx import TensorProto, numpy_helper
 import tensorrt
+from os.path import dirname, exists, join
+import ctypes
 
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
 def create_onnx(args):
     nms = helper.make_node(
         "DetectionNMS_IxRT",
@@ -44,10 +44,24 @@ def create_onnx(args):
     model = onnx.helper.make_model(graph, opset_imports=[op])
     onnx_path = args.path + "/nms.onnx"
     onnx.save(model, onnx_path)
+    
+def load_ixrt_plugin(
+    logger=tensorrt.Logger(tensorrt.Logger.WARNING), namespace="", dynamic_path=""
+):
+    if not dynamic_path:
+        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!"
+        )
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    tensorrt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
 
 def build_engine(args):
     onnx_path = args.path + "/nms.onnx"
     IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+    load_ixrt_plugin(IXRT_LOGGER)
     builder = tensorrt.Builder(IXRT_LOGGER)
     EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
     network = builder.create_network(EXPLICIT_BATCH)
diff --git a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
index 578e013d..2473f7d0 100644
--- a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
+++ b/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
@@ -2,9 +2,6 @@ import os
 import torch
 import torchvision.datasets
 from torch.utils.data import DataLoader
-
-
-
 from datasets.coco import CocoDetection
 
 def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
diff --git a/models/cv/object_detection/yolov3/ixrt/common.py b/models/cv/object_detection/yolov3/ixrt/common.py
index 5f543555..7d9a078e 100644
--- a/models/cv/object_detection/yolov3/ixrt/common.py
+++ b/models/cv/object_detection/yolov3/ixrt/common.py
@@ -2,8 +2,7 @@ import numpy as np
 from tqdm import tqdm
 
 import tensorrt
-import cuda.cuda as cuda
-import cuda.cudart as cudart
+from cuda import cuda, cudart
 
 # input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
 # output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
@@ -37,6 +36,23 @@ def save2json(batch_img_id, pred_boxes, json_result, class_trans):
                         "score": p,
                     }
                 )
+def save2json_nonms(batch_img_id, pred_boxes, json_result):
+    for i, boxes in enumerate(pred_boxes):
+        image_id = int(batch_img_id)
+        if boxes is not None:
+            x, y, w, h, c, p = boxes
+            if image_id!=-1:
+                
+                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
+                c = int(c)
+                json_result.append(
+                    {
+                    "image_id": image_id,
+                    "category_id": c,
+                    "bbox": [x, y, w, h],
+                    "score": p,
+                    }
+                    )
 
 def create_engine_context(engine_path, logger):
     with open(engine_path, "rb") as f:
@@ -68,7 +84,7 @@ def get_io_bindings(engine):
         for s in shape:
             size *= s
         err, allocation = cudart.cudaMalloc(size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
+        assert err == cudart.cudaError_t.cudaSuccess
         binding = {
             "index": i,
             "name": name,
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/common.py b/models/cv/object_detection/yolov3/ixrt/datasets/common.py
index e120e00f..a8e5e6e7 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/common.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/common.py
@@ -63,4 +63,6 @@ def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
 def clip_boxes(boxes, shape):
 
     boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
-    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
\ No newline at end of file
+    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
+    
+    return boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
index a58c02f8..7b411a50 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
@@ -1,6 +1,8 @@
 import cv2
 import math
 import numpy as np
+import torch
+import torch.nn.functional as F
 
 from .common import letterbox, scale_boxes, clip_boxes
 
@@ -11,6 +13,8 @@ def get_post_process(data_process_type):
         return Yolov3Postprocess
     elif data_process_type == "yolox":
         return YoloxPostprocess
+    elif data_process_type == "detr":
+        return DetrPostprocess
     return None
 
 def Yolov3Postprocess(
@@ -112,4 +116,42 @@ def YoloxPostprocess(
         all_box.append(boxes)
         data_offset += max_det * 6
 
-    return all_box
\ No newline at end of file
+    return all_box
+
+def box_cxcywh_to_xyxy(x):
+    x_c, y_c, w, h = x.unbind(-1)
+    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
+         (x_c + 0.5 * w), (y_c + 0.5 * h)]
+    return torch.stack(b, dim=-1)
+
+
+def convert_to_xywh(boxes):
+    xmin, ymin, xmax, ymax = boxes.unbind(-1)
+    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
+
+def DetrPostprocess(pred_logits, pred_boxes, target_sizes):
+    
+    out_logits = torch.from_numpy(pred_logits) 
+    out_bbox = torch.from_numpy(pred_boxes)
+    assert len(target_sizes) == 2
+    
+    prob = F.softmax(out_logits, -1)
+    scores, labels = prob[..., :-1].max(-1)
+    
+    # convert to [x0, y0, x1, y1] format 
+    boxes = box_cxcywh_to_xyxy(out_bbox)
+    # and from relative [0, 1] to absolute [0, height] coordinates
+    img_w, img_h = target_sizes
+    scale_fct = torch.tensor([img_w, img_h, img_w, img_h])
+    boxes = boxes * scale_fct
+   
+    boxes = clip_boxes(boxes, target_sizes)
+    boxes = convert_to_xywh(boxes)
+
+    labels = labels.unsqueeze(1)
+    scores =scores.unsqueeze(1)
+    pred_boxes = torch.cat([
+            boxes, 
+            labels, 
+            scores], dim=1).numpy().tolist()
+    return pred_boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
index 8cc643a8..e5b4ddfb 100644
--- a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
+++ b/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
@@ -11,6 +11,8 @@ def get_post_process(data_process_type):
         return Yolov3Preprocess
     elif data_process_type == "yolox":
         return YoloxPreprocess
+    elif data_process_type == "detr":
+        return DetrPreprocess
     return None
 
 def Yolov3Preprocess(image, img_size):
@@ -53,4 +55,22 @@ def YoloxPreprocess(img, img_size, swap=(2,0,1)):
     padded_img = padded_img.transpose(swap)
     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
 
-    return padded_img
\ No newline at end of file
+    return padded_img
+
+def DetrPreprocess(image, img_size):    
+    # img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
+    # img = img.resize((img_size, img_size))
+    
+    std = [0.485, 0.456, 0.406] 
+    mean = [0.229, 0.224, 0.225]
+    
+    image = cv2.resize(image, (img_size, img_size))
+    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
+    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
+    
+    image[0,:,:] = (image[0,:,:]- std[0])/mean[0]
+    image[1,:,:] = (image[1,:,:]- std[1])/mean[1]
+    image[2,:,:] = (image[2,:,:]- std[2])/mean[2]
+    
+    return image
+    
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/deploy.py b/models/cv/object_detection/yolov3/ixrt/deploy.py
index ec56b7ab..a686f4ff 100644
--- a/models/cv/object_detection/yolov3/ixrt/deploy.py
+++ b/models/cv/object_detection/yolov3/ixrt/deploy.py
@@ -62,16 +62,17 @@ def customize_ops(graph, args):
         stride=16,
         faster_impl=args.faster
     )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num*2:num*2+1],
-        outputs=["decoder_32"],
-        op_type=args.decoder_type,
-        anchor=args.decoder32_anchor,
-        num_class=args.num_class,
-        stride=32,
-        faster_impl=args.faster
-    )
+    
     if args.decoder64_anchor is not None:
+        graph = t.AddYoloDecoderOp(
+            inputs=decoder_input[num*2:num*2+1],
+            outputs=["decoder_32"],
+            op_type=args.decoder_type,
+            anchor=args.decoder32_anchor,
+            num_class=args.num_class,
+            stride=32,
+            faster_impl=args.faster
+        )
         graph = t.AddYoloDecoderOp(
             inputs=decoder_input[num*2+1:],
             outputs=["decoder_64"],
@@ -86,24 +87,25 @@ def customize_ops(graph, args):
             outputs=["output"],
             axis=1
         )
-    elif args.with_nms:
+    else:
+        graph = t.AddYoloDecoderOp(
+            inputs=decoder_input[num*2:],
+            outputs=["decoder_32"],
+            op_type=args.decoder_type,
+            anchor=args.decoder32_anchor,
+            num_class=args.num_class,
+            stride=32,
+            faster_impl=args.faster
+        )
         graph = t.AddConcatOp(
             inputs=["decoder_32", "decoder_16", "decoder_8"],
             outputs=["output"],
             axis=1
         )
 
-        graph.outputs.clear()
-        graph.add_output("output")
-        graph.outputs["output"].dtype = "FLOAT"
-    else:
-        graph.outputs.clear()
-        graph.add_output("decoder_8")
-        graph.outputs["decoder_8"].dtype = "FLOAT"
-        graph.add_output("decoder_16")
-        graph.outputs["decoder_16"].dtype = "FLOAT"
-        graph.add_output("decoder_32")
-        graph.outputs["decoder_32"].dtype = "FLOAT"
+    graph.outputs.clear()
+    graph.add_output("output")
+    graph.outputs["output"].dtype = "FLOAT"
     return graph
 
 def parse_args():
@@ -111,12 +113,11 @@ def parse_args():
     parser.add_argument("--src", type=str)
     parser.add_argument("--dst", type=str)
     parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
-    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
     parser.add_argument("--decoder_input_names", nargs='+', type=str)
-    parser.add_argument("--decoder8_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder16_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder32_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
+    parser.add_argument("--decoder8_anchor", nargs='*', type=float)
+    parser.add_argument("--decoder16_anchor", nargs='*', type=float)
+    parser.add_argument("--decoder32_anchor", nargs='*', type=float)
+    parser.add_argument("--decoder64_anchor", nargs='*', type=float, default=None)
     parser.add_argument("--num_class", type=int, default=80)
     parser.add_argument("--faster", type=int, default=1)
     parser.add_argument("--focus_input", type=str, default=None)
diff --git a/models/cv/object_detection/yolov3/ixrt/inference.py b/models/cv/object_detection/yolov3/ixrt/inference.py
index a2cc7d79..99f22322 100644
--- a/models/cv/object_detection/yolov3/ixrt/inference.py
+++ b/models/cv/object_detection/yolov3/ixrt/inference.py
@@ -10,11 +10,10 @@ import sys
 
 import torch
 import numpy as np
-import cuda.cuda as cuda
-import cuda.cudart as cudart
+from cuda import cuda, cudart
 
 from coco_labels import coco80_to_coco91_class, labels
-from common import save2json, box_class85to6
+from common import save2json, save2json_nonms, box_class85to6
 from common import create_engine_context, get_io_bindings
 from calibration_dataset import create_dataloaders
 from datasets.post_process import get_post_process
@@ -47,7 +46,7 @@ def main(config):
 
     bsz = config.bsz
     num_samples = 5000
-    if config.loop_count > 0:
+    if config.loop_count > 0 and config.loop_count < num_samples/bsz :
         num_samples = bsz * config.loop_count
     num_batch = len(dataloader)
     print("=" * 30)
@@ -67,7 +66,7 @@ def main(config):
     inputs, outputs, allocations = get_io_bindings(engine)
 
     # Load nms_engine
-    if config.test_mode == "MAP" and config.nms_type == "GPU":
+    if config.test_mode == "MAP" and config.nms_type == "GPU" and not config.no_nms:
         nms_engine, nms_context = create_engine_context(config.nms_engine, logger)
         nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine)
         nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"])
@@ -83,8 +82,14 @@ def main(config):
         print("Warm Done.")
 
     # Prepare the output data
-    output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
-    print(f"output shape : {output.shape} output type : {output.dtype}")
+    if config.no_nms:
+        batch_pred_logits = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+        batch_pred_boxes = np.zeros(outputs[1]["shape"], outputs[1]["dtype"])
+        print(f"pred_logits shape : {batch_pred_logits.shape} pred_logits type : {batch_pred_logits.dtype}")
+        print(f"pred_boxes shape : {batch_pred_boxes.shape} pred_boxes type : {batch_pred_boxes.dtype}")
+    else:
+        output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+        print(f"output shape : {output.shape} output type : {output.dtype}")
 
     for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader):
         batch_data = batch_data.numpy()
@@ -96,13 +101,14 @@ def main(config):
         # Set input
         err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
         assert(err == cuda.CUresult.CUDA_SUCCESS)
+
         # Forward
-        # start_time = time.time()
+        start_time = time.time()
         context.execute_v2(allocations)
-        # end_time = time.time()
-        # forward_time += end_time - start_time
+        end_time = time.time()
+        forward_time += end_time - start_time
 
-        if config.test_mode == "MAP":
+        if config.test_mode == "MAP" and not config.no_nms:
             # Fetch output
             err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
             assert(err == cuda.CUresult.CUDA_SUCCESS)
@@ -124,6 +130,7 @@ def main(config):
             if config.nms_type == "GPU":
 
                 # Set nms input
+                
                 err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
                 assert(err == cuda.CUresult.CUDA_SUCCESS)
                 nms_context.execute_v2(nms_allocations)
@@ -142,16 +149,31 @@ def main(config):
                 max_det=config.max_det
             )
             save2json(batch_img_id, pred_boxes, json_result, class_map)
+        elif config.test_mode == "MAP" and config.no_nms:
+            # Fetch output
+            err, = cuda.cuMemcpyDtoH(batch_pred_logits, outputs[0]["allocation"], outputs[0]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
+            err, = cuda.cuMemcpyDtoH(batch_pred_boxes, outputs[1]["allocation"], outputs[1]["nbytes"])
+            assert(err == cuda.CUresult.CUDA_SUCCESS)
 
-    # fps = num_samples / forward_time
+            for (pred_logits, pred_boxes, img_h, img_w, img_id) in zip(
+                batch_pred_logits, 
+                batch_pred_boxes, 
+                batch_img_shape[0],
+                batch_img_shape[1], 
+                batch_img_id):
+                pred_boxes = post_process_func(pred_logits, pred_boxes, [img_w, img_h])  
+                # print(img_id)
+                # print(img_w, img_h)
+                
+                # import ipdb
+                # ipdb.set_trace()
+                      
+                save2json_nonms(img_id, pred_boxes, json_result)
+
+    fps = num_samples / forward_time
 
     if config.test_mode == "FPS":
-        start_time = time.time()       
-        for i in range(config.loop_count):
-            context.execute_v2(allocations)  
-        end_time = time.time()  
-        forward_time = end_time - start_time      
-        fps = (config.loop_count*config.bsz) / forward_time
         print("FPS : ", fps)
         print(f"Performance Check : Test {fps} >= target {config.fps_target}")
         if fps >= config.fps_target:
@@ -159,12 +181,12 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(10)
+            exit(1)
 
     if config.test_mode == "MAP":
         if len(json_result) == 0:
             print("Predict zero box!")
-            exit(10)
+            exit(1)
 
         if not os.path.exists(config.pred_dir):
             os.makedirs(config.pred_dir)
@@ -195,7 +217,7 @@ def main(config):
             exit()
         else:
             print("failed!")
-            exit(10)
+            exit(1)
 
 def parse_config():
     parser = argparse.ArgumentParser()
@@ -249,6 +271,7 @@ def parse_config():
     parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
     parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly")
     parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU")
+    parser.add_argument("--no_nms", type=bool, default=False, help="NMS")
 
     config = parser.parse_args()
     print("config:", config)
diff --git a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
index f696ae55..00ed65dd 100644
--- a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
+++ b/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
@@ -1,7 +1,5 @@
 import onnx
 import argparse
-import copy
-import numpy as np
 
 def change_input_dim(model, bsz):
     batch_size = bsz
@@ -33,22 +31,7 @@ def parse_args():
     args = parser.parse_args()
     return args
 
-def modify_resize_nodes(model, bsz):
-    print("modify resize")
-    for node in model.graph.node:
-        if node.op_type == "Resize":
-            if len(node.input) >= 4 and node.input[3]:
-                sizes_name = node.input[3]
-                for initializer in model.graph.initializer:
-                    if initializer.name == sizes_name:
-                        shape = copy.deepcopy(onnx.numpy_helper.to_array(initializer))
-                        shape[0] = shape[0] * bsz
-                        new_sizes = np.array(shape, dtype=np.int64)
-                        initializer.CopyFrom(onnx.numpy_helper.from_array(new_sizes, name=initializer.name))
-                        break
-    
 args = parse_args()
 model = onnx.load(args.origin_model)
 change_input_dim(model, args.batch_size)
-modify_resize_nodes(model, args.batch_size)
-onnx.save(model, args.output_model)
+onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov3/ixrt/quant.py b/models/cv/object_detection/yolov3/ixrt/quant.py
index d73212ca..bcf5d9b6 100644
--- a/models/cv/object_detection/yolov3/ixrt/quant.py
+++ b/models/cv/object_detection/yolov3/ixrt/quant.py
@@ -5,9 +5,6 @@ import numpy as np
 from tensorrt.deploy import static_quantize
 
 import torch
-import sys
-sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
-print(sys.path)
 from calibration_dataset import create_dataloaders
 
 def setseed(seed=42):
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
index 932edf9d..81f27858 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.65
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
-            --with_nms             True                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
index 5f3360a6..357fb10b 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_fp16_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=1010
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -80,7 +76,6 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
-
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -116,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -124,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
-            --with_nms             False                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -139,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -155,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -182,7 +176,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}              \
+    --model_engine=${ENGINE_FILE}               \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
index 3e468467..a0961b5c 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.65
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
-            --with_nms             True                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
index c7ac4c1b..d0f1f48a 100644
--- a/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
+++ b/models/cv/object_detection/yolov3/ixrt/scripts/infer_yolov3_int8_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=1010
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -116,7 +112,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -124,7 +120,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV3Decoder             \
-            --with_nms             False                   \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -139,7 +134,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -155,7 +150,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
-- 
Gitee


From 25aa27d200eccc687af5998b32fc2e751e198f72 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 16:38:34 +0800
Subject: [PATCH 13/15] add ixrt common in detection

---
 .../ixrt => ixrt_common}/build_engine.py      |   0
 .../ixrt => ixrt_common}/build_nms_engine.py  |   0
 .../calibration_dataset.py                    |   0
 .../ixrt => ixrt_common}/coco_labels.py       |   0
 .../{yolov3/ixrt => ixrt_common}/common.py    |   0
 .../ixrt => ixrt_common}/config/YOLOV3_CONFIG |   0
 .../config/YOLOV5M_CONFIG                     |   0
 .../config/YOLOV5S_CONFIG                     |   0
 .../ixrt => ixrt_common}/config/YOLOV7_CONFIG |   0
 .../{yolov3/ixrt => ixrt_common}/cut_model.py |   0
 .../ixrt => ixrt_common}/datasets/__init__.py |   0
 .../ixrt => ixrt_common}/datasets/coco.py     |   0
 .../ixrt => ixrt_common}/datasets/common.py   |   0
 .../datasets/post_process.py                  |   0
 .../datasets/pre_process.py                   |   0
 .../ixrt => ixrt_common}/datasets/vision.py   |   0
 .../{yolov3/ixrt => ixrt_common}/deploy.py    |   0
 .../{yolov3/ixrt => ixrt_common}/inference.py |   0
 .../ixrt => ixrt_common}/load_ixrt_plugin.py  |   0
 .../ixrt => ixrt_common}/modify_batchsize.py  |   0
 .../{yolov3/ixrt => ixrt_common}/quant.py     |   0
 .../ixrt => ixrt_common}/requirements.txt     |   0
 .../ixrt => ixrt_common}/simplify_model.py    |   0
 .../cv/object_detection/yolov3/ixrt/README.md |   8 +-
 .../yolov3/ixrt/ci/prepare.sh                 |   6 +-
 .../cv/object_detection/yolov5/ixrt/README.md |  10 +-
 .../yolov5/ixrt/build_engine.py               |  43 ---
 .../yolov5/ixrt/build_nms_engine.py           |  81 ------
 .../yolov5/ixrt/calibration_dataset.py        |  30 --
 .../yolov5/ixrt/ci/prepare.sh                 |   2 +-
 .../yolov5/ixrt/coco_labels.py                |  89 ------
 .../cv/object_detection/yolov5/ixrt/common.py |  86 ------
 .../object_detection/yolov5/ixrt/cut_model.py |  16 --
 .../yolov5/ixrt/datasets/__init__.py          |   0
 .../yolov5/ixrt/datasets/coco.py              | 116 --------
 .../yolov5/ixrt/datasets/common.py            |  66 -----
 .../yolov5/ixrt/datasets/post_process.py      | 115 --------
 .../yolov5/ixrt/datasets/pre_process.py       |  56 ----
 .../yolov5/ixrt/datasets/vision.py            | 136 ---------
 .../cv/object_detection/yolov5/ixrt/deploy.py | 134 ---------
 .../object_detection/yolov5/ixrt/inference.py | 260 -----------------
 .../yolov5/ixrt/load_ixrt_plugin.py           |  12 -
 .../yolov5/ixrt/modify_batchsize.py           |  37 ---
 .../cv/object_detection/yolov5/ixrt/quant.py  |  55 ----
 .../yolov5/ixrt/requirements.txt              |   7 -
 .../scripts/infer_yolov5_fp16_accuracy.sh     |  17 +-
 .../scripts/infer_yolov5_fp16_performance.sh  |  22 +-
 .../scripts/infer_yolov5_int8_accuracy.sh     |  17 +-
 .../scripts/infer_yolov5_int8_performance.sh  |  19 +-
 .../yolov5/ixrt/simplify_model.py             |  21 --
 .../object_detection/yolov5s/ixrt/README.md   |   8 +-
 .../yolov5s/ixrt/build_engine.py              |  43 ---
 .../yolov5s/ixrt/build_nms_engine.py          |  81 ------
 .../yolov5s/ixrt/calibration_dataset.py       |  31 ---
 .../yolov5s/ixrt/ci/prepare.sh                |   2 +-
 .../yolov5s/ixrt/coco_labels.py               |  89 ------
 .../object_detection/yolov5s/ixrt/common.py   |  86 ------
 .../yolov5s/ixrt/cut_model.py                 |  16 --
 .../yolov5s/ixrt/datasets/__init__.py         |   0
 .../yolov5s/ixrt/datasets/coco.py             | 116 --------
 .../yolov5s/ixrt/datasets/common.py           |  66 -----
 .../yolov5s/ixrt/datasets/post_process.py     | 115 --------
 .../yolov5s/ixrt/datasets/pre_process.py      |  56 ----
 .../yolov5s/ixrt/datasets/vision.py           | 136 ---------
 .../object_detection/yolov5s/ixrt/deploy.py   | 134 ---------
 .../yolov5s/ixrt/inference.py                 | 260 -----------------
 .../yolov5s/ixrt/load_ixrt_plugin.py          |  12 -
 .../yolov5s/ixrt/modify_batchsize.py          |  37 ---
 .../cv/object_detection/yolov5s/ixrt/quant.py |  55 ----
 .../yolov5s/ixrt/requirements.txt             |   6 -
 .../scripts/infer_yolov5s_fp16_accuracy.sh    |  17 +-
 .../scripts/infer_yolov5s_fp16_performance.sh |  19 +-
 .../scripts/infer_yolov5s_int8_accuracy.sh    |  17 +-
 .../scripts/infer_yolov5s_int8_performance.sh |  22 +-
 .../yolov5s/ixrt/simplify_model.py            |  21 --
 .../cv/object_detection/yolov7/ixrt/README.md |   8 +-
 .../yolov7/ixrt/build_engine.py               |  43 ---
 .../yolov7/ixrt/build_nms_engine.py           |  81 ------
 .../yolov7/ixrt/calibration_dataset.py        |  31 ---
 .../yolov7/ixrt/ci/prepare.sh                 |   2 +-
 .../yolov7/ixrt/coco_labels.py                |  89 ------
 .../cv/object_detection/yolov7/ixrt/common.py |  86 ------
 .../object_detection/yolov7/ixrt/cut_model.py |  16 --
 .../yolov7/ixrt/datasets/__init__.py          |   0
 .../yolov7/ixrt/datasets/coco.py              | 116 --------
 .../yolov7/ixrt/datasets/common.py            |  66 -----
 .../yolov7/ixrt/datasets/post_process.py      | 115 --------
 .../yolov7/ixrt/datasets/pre_process.py       |  56 ----
 .../yolov7/ixrt/datasets/vision.py            | 136 ---------
 .../cv/object_detection/yolov7/ixrt/deploy.py | 125 ---------
 .../object_detection/yolov7/ixrt/inference.py | 261 ------------------
 .../yolov7/ixrt/load_ixrt_plugin.py           |  12 -
 .../yolov7/ixrt/modify_batchsize.py           |  37 ---
 .../cv/object_detection/yolov7/ixrt/quant.py  |  55 ----
 .../yolov7/ixrt/requirements.txt              |   7 -
 .../scripts/infer_yolov7_fp16_accuracy.sh     |  14 +-
 .../scripts/infer_yolov7_fp16_performance.sh  |  19 +-
 .../scripts/infer_yolov7_int8_accuracy.sh     |  10 +-
 .../scripts/infer_yolov7_int8_performance.sh  |  12 +-
 .../yolov7/ixrt/simplify_model.py             |  21 --
 tests/run_ixrt.py                             |  44 +--
 101 files changed, 125 insertions(+), 4243 deletions(-)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/build_engine.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/build_nms_engine.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/calibration_dataset.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/coco_labels.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/common.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/config/YOLOV3_CONFIG (100%)
 rename models/cv/object_detection/{yolov5/ixrt => ixrt_common}/config/YOLOV5M_CONFIG (100%)
 rename models/cv/object_detection/{yolov5s/ixrt => ixrt_common}/config/YOLOV5S_CONFIG (100%)
 mode change 100755 => 100644
 rename models/cv/object_detection/{yolov7/ixrt => ixrt_common}/config/YOLOV7_CONFIG (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/cut_model.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/__init__.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/coco.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/common.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/post_process.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/pre_process.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/datasets/vision.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/deploy.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/inference.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/load_ixrt_plugin.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/modify_batchsize.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/quant.py (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/requirements.txt (100%)
 rename models/cv/object_detection/{yolov3/ixrt => ixrt_common}/simplify_model.py (100%)
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/build_engine.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/coco_labels.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/common.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/cut_model.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/__init__.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/coco.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/common.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/datasets/vision.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/deploy.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/inference.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/quant.py
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/requirements.txt
 delete mode 100644 models/cv/object_detection/yolov5/ixrt/simplify_model.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/build_engine.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/coco_labels.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/common.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/cut_model.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/common.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
 delete mode 100755 models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/deploy.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/inference.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/quant.py
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/requirements.txt
 delete mode 100644 models/cv/object_detection/yolov5s/ixrt/simplify_model.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/build_engine.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/coco_labels.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/common.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/cut_model.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/__init__.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/coco.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/common.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/datasets/vision.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/deploy.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/inference.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/quant.py
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/requirements.txt
 delete mode 100644 models/cv/object_detection/yolov7/ixrt/simplify_model.py

diff --git a/models/cv/object_detection/yolov3/ixrt/build_engine.py b/models/cv/object_detection/ixrt_common/build_engine.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/build_engine.py
rename to models/cv/object_detection/ixrt_common/build_engine.py
diff --git a/models/cv/object_detection/yolov3/ixrt/build_nms_engine.py b/models/cv/object_detection/ixrt_common/build_nms_engine.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/build_nms_engine.py
rename to models/cv/object_detection/ixrt_common/build_nms_engine.py
diff --git a/models/cv/object_detection/yolov3/ixrt/calibration_dataset.py b/models/cv/object_detection/ixrt_common/calibration_dataset.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/calibration_dataset.py
rename to models/cv/object_detection/ixrt_common/calibration_dataset.py
diff --git a/models/cv/object_detection/yolov3/ixrt/coco_labels.py b/models/cv/object_detection/ixrt_common/coco_labels.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/coco_labels.py
rename to models/cv/object_detection/ixrt_common/coco_labels.py
diff --git a/models/cv/object_detection/yolov3/ixrt/common.py b/models/cv/object_detection/ixrt_common/common.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/common.py
rename to models/cv/object_detection/ixrt_common/common.py
diff --git a/models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV3_CONFIG
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/config/YOLOV3_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV3_CONFIG
diff --git a/models/cv/object_detection/yolov5/ixrt/config/YOLOV5M_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV5M_CONFIG
similarity index 100%
rename from models/cv/object_detection/yolov5/ixrt/config/YOLOV5M_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV5M_CONFIG
diff --git a/models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV5S_CONFIG
old mode 100755
new mode 100644
similarity index 100%
rename from models/cv/object_detection/yolov5s/ixrt/config/YOLOV5S_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV5S_CONFIG
diff --git a/models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG b/models/cv/object_detection/ixrt_common/config/YOLOV7_CONFIG
similarity index 100%
rename from models/cv/object_detection/yolov7/ixrt/config/YOLOV7_CONFIG
rename to models/cv/object_detection/ixrt_common/config/YOLOV7_CONFIG
diff --git a/models/cv/object_detection/yolov3/ixrt/cut_model.py b/models/cv/object_detection/ixrt_common/cut_model.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/cut_model.py
rename to models/cv/object_detection/ixrt_common/cut_model.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/__init__.py b/models/cv/object_detection/ixrt_common/datasets/__init__.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/__init__.py
rename to models/cv/object_detection/ixrt_common/datasets/__init__.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/coco.py b/models/cv/object_detection/ixrt_common/datasets/coco.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/coco.py
rename to models/cv/object_detection/ixrt_common/datasets/coco.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/common.py b/models/cv/object_detection/ixrt_common/datasets/common.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/common.py
rename to models/cv/object_detection/ixrt_common/datasets/common.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/post_process.py b/models/cv/object_detection/ixrt_common/datasets/post_process.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/post_process.py
rename to models/cv/object_detection/ixrt_common/datasets/post_process.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py b/models/cv/object_detection/ixrt_common/datasets/pre_process.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/pre_process.py
rename to models/cv/object_detection/ixrt_common/datasets/pre_process.py
diff --git a/models/cv/object_detection/yolov3/ixrt/datasets/vision.py b/models/cv/object_detection/ixrt_common/datasets/vision.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/datasets/vision.py
rename to models/cv/object_detection/ixrt_common/datasets/vision.py
diff --git a/models/cv/object_detection/yolov3/ixrt/deploy.py b/models/cv/object_detection/ixrt_common/deploy.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/deploy.py
rename to models/cv/object_detection/ixrt_common/deploy.py
diff --git a/models/cv/object_detection/yolov3/ixrt/inference.py b/models/cv/object_detection/ixrt_common/inference.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/inference.py
rename to models/cv/object_detection/ixrt_common/inference.py
diff --git a/models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/ixrt_common/load_ixrt_plugin.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/load_ixrt_plugin.py
rename to models/cv/object_detection/ixrt_common/load_ixrt_plugin.py
diff --git a/models/cv/object_detection/yolov3/ixrt/modify_batchsize.py b/models/cv/object_detection/ixrt_common/modify_batchsize.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/modify_batchsize.py
rename to models/cv/object_detection/ixrt_common/modify_batchsize.py
diff --git a/models/cv/object_detection/yolov3/ixrt/quant.py b/models/cv/object_detection/ixrt_common/quant.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/quant.py
rename to models/cv/object_detection/ixrt_common/quant.py
diff --git a/models/cv/object_detection/yolov3/ixrt/requirements.txt b/models/cv/object_detection/ixrt_common/requirements.txt
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/requirements.txt
rename to models/cv/object_detection/ixrt_common/requirements.txt
diff --git a/models/cv/object_detection/yolov3/ixrt/simplify_model.py b/models/cv/object_detection/ixrt_common/simplify_model.py
similarity index 100%
rename from models/cv/object_detection/yolov3/ixrt/simplify_model.py
rename to models/cv/object_detection/ixrt_common/simplify_model.py
diff --git a/models/cv/object_detection/yolov3/ixrt/README.md b/models/cv/object_detection/yolov3/ixrt/README.md
index e0003094..dc7dad6f 100644
--- a/models/cv/object_detection/yolov3/ixrt/README.md
+++ b/models/cv/object_detection/yolov3/ixrt/README.md
@@ -30,7 +30,7 @@ yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 ```
 
 ### Model Conversion
@@ -50,13 +50,13 @@ mv weights/export.onnx /Path/to/checkpoints/yolov3.onnx
 ## Model Inference
 
 ```bash
-export PROJ_DIR=/Path/to/yolov3/ixrt
+export PROJ_DIR=./
 export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=./coco/annotations/instances_val2017.json
 export EVAL_DIR=./coco/images/val2017
-export RUN_DIR=/Path/to/yolov3/ixrt
-export CONFIG_DIR=config/YOLOV3_CONFIG
+export RUN_DIR=../../ixrt_common
+export CONFIG_DIR=../../ixrt_common/config/YOLOV3_CONFIG
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
index 9fb652f1..481003d1 100644
--- a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
@@ -25,10 +25,10 @@ else
     echo "Not Support Os"
 fi
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 mkdir checkpoints
-unzip -q /root/data/3rd_party/onnx_tflite_yolov3.zip -d ./
-cp /root/data/checkpoints/yolov3.weights onnx_tflite_yolov3/weights
+unzip -q /mnt/deepspark/data/3rd_party/onnx_tflite_yolov3.zip -d ./
+cp /mnt/deepspark/data/checkpoints/yolov3.weights onnx_tflite_yolov3/weights
 cd onnx_tflite_yolov3
 python3 detect.py --cfg cfg/yolov3.cfg --weights weights/yolov3.weights
 mv weights/export.onnx ../checkpoints/yolov3.onnx
diff --git a/models/cv/object_detection/yolov5/ixrt/README.md b/models/cv/object_detection/yolov5/ixrt/README.md
index 4394ba8c..69f56840 100644
--- a/models/cv/object_detection/yolov5/ixrt/README.md
+++ b/models/cv/object_detection/yolov5/ixrt/README.md
@@ -30,7 +30,7 @@ yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 ```
 
 ### Model Conversion
@@ -54,13 +54,13 @@ mv yolov5m.onnx /Path/to/checkpoints
 ## Model Inference
 
 ```bash
-export PROJ_DIR=/Path/to/yolov5/ixrt
-export DATASETS_DIR=/Path/to/coco2017/
+export PROJ_DIR=./
+export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
 export EVAL_DIR=${DATASETS_DIR}/images/val2017
-export RUN_DIR=/Path/to/yolov5/ixrt
-export CONFIG_DIR=config/YOLOV5M_CONFIG
+export RUN_DIR=../../ixrt_common
+export CONFIG_DIR=../../ixrt_common/config/YOLOV5M_CONFIG
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/yolov5/ixrt/build_engine.py b/models/cv/object_detection/yolov5/ixrt/build_engine.py
deleted file mode 100644
index d47e45e5..00000000
--- a/models/cv/object_detection/yolov5/ixrt/build_engine.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import cv2
-import argparse
-import numpy as np
-
-import torch
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(config.model)
-
-    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
-    # print("precision : ", precision)
-    build_config.set_flag(precision)
-
-    plan = builder.build_serialized_network(network, build_config)
-    engine_file_path = config.engine
-    with open(engine_file_path, "wb") as f:
-        f.write(plan)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    # engine args
-    parser.add_argument("--engine", type=str, default=None)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
deleted file mode 100644
index 25f0ab8a..00000000
--- a/models/cv/object_detection/yolov5/ixrt/build_nms_engine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import argparse
-import torch
-import onnx
-from onnx import helper
-from onnx import TensorProto, numpy_helper
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-def create_onnx(args):
-    nms = helper.make_node(
-        "DetectionNMS_IxRT",
-        name="NMS",
-        inputs=["nms_input"],
-        outputs=["nms_output0", "nms_output1"],
-        nMaxKeep=args.max_box_pre_img,
-        fIoUThresh=args.iou_thresh,
-        fScoreThresh=args.score_thresh
-    )
-    graph = helper.make_graph(
-        nodes=[nms],
-        name="gpu_nms",
-        inputs=[
-            helper.make_tensor_value_info(
-                "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6)
-            )
-        ],
-        outputs=[
-            helper.make_tensor_value_info(
-                "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6)
-            ),
-            helper.make_tensor_value_info(
-                "nms_output1", onnx.TensorProto.INT32, (args.bsz,)
-            )
-        ],
-        initializer=[]
-    )
-
-    op = onnx.OperatorSetIdProto()
-    op.version = 13
-    model = onnx.helper.make_model(graph)
-
-    model = onnx.helper.make_model(graph, opset_imports=[op])
-    onnx_path = args.path + "/nms.onnx"
-    onnx.save(model, onnx_path)
-
-def build_engine(args):
-    onnx_path = args.path + "/nms.onnx"
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(onnx_path)
-    plan = builder.build_serialized_network(network, build_config)
-
-    engine_path = args.path + "/nms.engine"
-    with open(engine_path, "wb") as f:
-        f.write(plan)
-
-def main(args):
-    create_onnx(args)
-    build_engine(args)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--bsz", type=int, default=1, help="batch size")
-    parser.add_argument("--path", type=str)
-    parser.add_argument("--all_box_num", type=int, default=25200)
-    parser.add_argument("--max_box_pre_img", type=int, default=1000)
-    parser.add_argument("--iou_thresh", type=float, default=0.6)
-    parser.add_argument("--score_thresh", type=float, default=0.001)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
deleted file mode 100644
index 7d3e3e48..00000000
--- a/models/cv/object_detection/yolov5/ixrt/calibration_dataset.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import os
-import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-
-
-from datasets.coco import CocoDetection
-
-def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
-    dataset = CocoDetection(
-        root=data_path,
-        annFile=annFile,
-        img_size=img_sz,
-        data_process_type=data_process_type
-    )
-    calibration_dataset = dataset
-    num_samples = min(5000, batch_size * step)
-    if num_samples > 0:
-        calibration_dataset = torch.utils.data.Subset(
-            dataset, indices=range(num_samples)
-        )
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=False,
-        batch_size=batch_size,
-        drop_last=False,
-        num_workers=workers,
-    )
-    return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
index b66c06b5..b99ab99d 100644
--- a/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov5/ixrt/ci/prepare.sh
@@ -25,7 +25,7 @@ else
     echo "Not Support Os"
 fi
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 
 mkdir checkpoints
 cp -r /root/data/3rd_party/yolov5 ./
diff --git a/models/cv/object_detection/yolov5/ixrt/coco_labels.py b/models/cv/object_detection/yolov5/ixrt/coco_labels.py
deleted file mode 100644
index 69d38878..00000000
--- a/models/cv/object_detection/yolov5/ixrt/coco_labels.py
+++ /dev/null
@@ -1,89 +0,0 @@
-labels = [
-    "person",
-    "bicycle",
-    "car",
-    "motorcycle",
-    "airplane",
-    "bus",
-    "train",
-    "truck",
-    "boat",
-    "traffic light",
-    "fire hydrant",
-    "stop sign",
-    "parking meter",
-    "bench",
-    "bird",
-    "cat",
-    "dog",
-    "horse",
-    "sheep",
-    "cow",
-    "elephant",
-    "bear",
-    "zebra",
-    "giraffe",
-    "backpack",
-    "umbrella",
-    "handbag",
-    "tie",
-    "suitcase",
-    "frisbee",
-    "skis",
-    "snowboard",
-    "sports ball",
-    "kite",
-    "baseball bat",
-    "baseball glove",
-    "skateboard",
-    "surfboard",
-    "tennis racket",
-    "bottle",
-    "wine glass",
-    "cup",
-    "fork",
-    "knife",
-    "spoon",
-    "bowl",
-    "banana",
-    "apple",
-    "sandwich",
-    "orange",
-    "broccoli",
-    "carrot",
-    "hot dog",
-    "pizza",
-    "donut",
-    "cake",
-    "chair",
-    "couch",
-    "potted plant",
-    "bed",
-    "dining table",
-    "toilet",
-    "tv",
-    "laptop",
-    "mouse",
-    "remote",
-    "keyboard",
-    "cell phone",
-    "microwave",
-    "oven",
-    "toaster",
-    "sink",
-    "refrigerator",
-    "book",
-    "clock",
-    "vase",
-    "scissors",
-    "teddy bear",
-    "hair drier",
-    "toothbrush",
-]
-def coco80_to_coco91_class():  # converts 80-index (val2014) to 91-index (paper)
-    return [
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
-        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-        64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
-
-__all__ = ["labels"]
diff --git a/models/cv/object_detection/yolov5/ixrt/common.py b/models/cv/object_detection/yolov5/ixrt/common.py
deleted file mode 100644
index 5f543555..00000000
--- a/models/cv/object_detection/yolov5/ixrt/common.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-
-import tensorrt
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-# input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
-# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
-def box_class85to6(input):
-    center_x_y = input[:, :2]
-    side = input[:, 2:4]
-    conf = input[:, 4:5]
-    class_id = np.argmax(input[:, 5:], axis = -1)
-    class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
-    max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
-    x1_y1 = center_x_y - 0.5 * side
-    x2_y2 = center_x_y + 0.5 * side
-    nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
-    return nms_input
-
-def save2json(batch_img_id, pred_boxes, json_result, class_trans):
-    for i, boxes in enumerate(pred_boxes):
-        if boxes is not None:
-            image_id = int(batch_img_id[i])
-            # have no target
-            if image_id == -1:
-                continue
-            for x, y, w, h, c, p in boxes:
-                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
-                c = int(c)
-                json_result.append(
-                    {
-                        "image_id": image_id,
-                        "category_id": class_trans[c - 1],
-                        "bbox": [x, y, w, h],
-                        "score": p,
-                    }
-                )
-
-def create_engine_context(engine_path, logger):
-    with open(engine_path, "rb") as f:
-        runtime = tensorrt.Runtime(logger)
-        assert runtime
-        engine = runtime.deserialize_cuda_engine(f.read())
-        assert engine
-        context = engine.create_execution_context()
-        assert context
-
-    return engine, context
-
-def get_io_bindings(engine):
-    # Setup I/O bindings
-    inputs = []
-    outputs = []
-    allocations = []
-
-    for i in range(engine.num_bindings):
-        is_input = False
-        if engine.binding_is_input(i):
-            is_input = True
-        name = engine.get_binding_name(i)
-        dtype = engine.get_binding_dtype(i)
-        shape = engine.get_binding_shape(i)
-        if is_input:
-            batch_size = shape[0]
-        size = np.dtype(tensorrt.nptype(dtype)).itemsize
-        for s in shape:
-            size *= s
-        err, allocation = cudart.cudaMalloc(size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-        binding = {
-            "index": i,
-            "name": name,
-            "dtype": np.dtype(tensorrt.nptype(dtype)),
-            "shape": list(shape),
-            "allocation": allocation,
-            "nbytes": size,
-        }
-        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
-        allocations.append(allocation)
-        if engine.binding_is_input(i):
-            inputs.append(binding)
-        else:
-            outputs.append(binding)
-    return inputs, outputs, allocations
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/cut_model.py b/models/cv/object_detection/yolov5/ixrt/cut_model.py
deleted file mode 100644
index af0a3a4f..00000000
--- a/models/cv/object_detection/yolov5/ixrt/cut_model.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    parser.add_argument("--input_names", nargs='+', type=str)
-    parser.add_argument("--output_names", nargs='+', type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
-print("  Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5/ixrt/datasets/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py b/models/cv/object_detection/yolov5/ixrt/datasets/coco.py
deleted file mode 100644
index 7f355b84..00000000
--- a/models/cv/object_detection/yolov5/ixrt/datasets/coco.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os.path
-from typing import Any, Callable, List, Optional, Tuple
-
-import cv2
-
-from .vision import VisionDataset
-from .pre_process import get_post_process
-class CocoDetection(VisionDataset):
-    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        annFile: str,
-        img_size: int,
-        data_process_type: str,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-        transforms: Optional[Callable] = None,
-        
-    ) -> None:
-        super().__init__(root, transforms, transform, target_transform)
-        from pycocotools.coco import COCO
-
-        self.coco = COCO(annFile)
-        self.ids = list(sorted(self.coco.imgs.keys()))
-        self.img_size = img_size
-        
-        self.transforms = get_post_process(data_process_type)
-
-    def _load_image(self, id: int):
-        path = self.coco.loadImgs(id)[0]["file_name"]
-        data = cv2.imread(os.path.join(self.root, path))
-        return data
-
-    def _load_target(self, id: int) -> List[Any]:
-        return self.coco.loadAnns(self.coco.getAnnIds(id))
-
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        id = self.ids[index]
-        image = self._load_image(id)
-        target = self._load_target(id)
-        origin_shape = image.shape[:2]
-
-        if self.transforms is not None:
-            image = self.transforms(image, self.img_size)
-
-        if len(target) > 0:
-            image_id = target[0]["image_id"]
-        else:
-            # have no target
-            image_id = -1
-        return image, origin_shape, image_id
-
-    def __len__(self) -> int:
-        return len(self.ids)
-
-
-class CocoCaptions(CocoDetection):
-    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-
-    Example:
-
-        .. code:: python
-
-            import torchvision.datasets as dset
-            import torchvision.transforms as transforms
-            cap = dset.CocoCaptions(root = 'dir where images are',
-                                    annFile = 'json annotation file',
-                                    transform=transforms.PILToTensor())
-
-            print('Number of samples: ', len(cap))
-            img, target = cap[3] # load 4th sample
-
-            print("Image Size: ", img.size())
-            print(target)
-
-        Output: ::
-
-            Number of samples: 82783
-            Image Size: (3L, 427L, 640L)
-            [u'A plane emitting smoke stream flying over a mountain.',
-            u'A plane darts across a bright blue sky behind a mountain covered in snow',
-            u'A plane leaves a contrail above the snowy mountain top.',
-            u'A mountain that has a plane flying overheard in the distance.',
-            u'A mountain view with a plume of smoke in the background']
-
-    """
-
-    def _load_target(self, id: int) -> List[str]:
-        return [ann["caption"] for ann in super()._load_target(id)]
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/common.py b/models/cv/object_detection/yolov5/ixrt/datasets/common.py
deleted file mode 100644
index e120e00f..00000000
--- a/models/cv/object_detection/yolov5/ixrt/datasets/common.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
-    # Resize and pad image while meeting stride-multiple constraints
-    shape = im.shape[:2]  # current shape [height, width]
-    if isinstance(new_shape, int):
-        new_shape = (new_shape, new_shape)
-
-    # Scale ratio (new / old)
-    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-    if not scaleup:  # only scale down, do not scale up (for better val mAP)
-        r = min(r, 1.0)
-
-    # Compute padding
-    ratio = r, r  # width, height ratios
-    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
-    if auto:  # minimum rectangle
-        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
-    elif scaleFill:  # stretch
-        dw, dh = 0.0, 0.0
-        new_unpad = (new_shape[1], new_shape[0])
-        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
-
-    dw /= 2  # divide padding into 2 sides
-    dh /= 2
-
-    if shape[::-1] != new_unpad:  # resize
-        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
-    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
-    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
-    return im, ratio, (dw, dh)
-
-def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
-    # Rescale boxes (xyxy) from net_shape to ori_shape
-
-    if use_letterbox:
-
-        gain = min(
-            net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
-        )  # gain  = new / old
-        pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
-            net_shape[0] - ori_shape[0] * gain
-        ) / 2.0
-
-        boxes[:, [0, 2]] -= pad[0]  # x padding
-        boxes[:, [1, 3]] -= pad[1]  # y padding
-        boxes[:, :4] /= gain
-    else:
-        x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
-
-        boxes[:, 0] /= x_scale
-        boxes[:, 1] /= y_scale
-        boxes[:, 2] /= x_scale
-        boxes[:, 3] /= y_scale
-
-    clip_boxes(boxes, ori_shape)
-    return boxes
-
-def clip_boxes(boxes, shape):
-
-    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
-    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
deleted file mode 100644
index a58c02f8..00000000
--- a/models/cv/object_detection/yolov5/ixrt/datasets/post_process.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox, scale_boxes, clip_boxes
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Postprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Postprocess
-    elif data_process_type == "yolox":
-        return YoloxPostprocess
-    return None
-
-def Yolov3Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=False
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def Yolov5Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=True
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def YoloxPostprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i])
-            boxes[:, :4] /= r
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-            clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i]))
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
deleted file mode 100644
index 8cc643a8..00000000
--- a/models/cv/object_detection/yolov5/ixrt/datasets/pre_process.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Preprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Preprocess
-    elif data_process_type == "yolox":
-        return YoloxPreprocess
-    return None
-
-def Yolov3Preprocess(image, img_size):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    image = cv2.resize(image, (img_size, img_size))
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def Yolov5Preprocess(image, img_size, augment=False):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    if r != 1:  # if sizes are not equal
-        interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA
-        image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
-
-    # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  rect == True
-
-    image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False)
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def YoloxPreprocess(img, img_size, swap=(2,0,1)):
-
-    padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114
-    r = min(img_size / img.shape[0], img_size / img.shape[1])
-    resized_img = cv2.resize(
-        img,
-        (int(img.shape[1] * r), int(img.shape[0] * r)),
-        interpolation=cv2.INTER_LINEAR, 
-    ).astype(np.uint8)
-
-    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
-    padded_img = padded_img.transpose(swap)
-    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
-
-    return padded_img
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py b/models/cv/object_detection/yolov5/ixrt/datasets/vision.py
deleted file mode 100644
index 32da4a78..00000000
--- a/models/cv/object_detection/yolov5/ixrt/datasets/vision.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-from typing import Any, Callable, List, Optional, Tuple
-
-import torch
-import torch.utils.data as data
-
-from types import FunctionType
-
-def _log_api_usage_once(obj: Any) -> None:
-
-    """
-    Logs API usage(module and name) within an organization.
-    In a large ecosystem, it's often useful to track the PyTorch and
-    TorchVision APIs usage. This API provides the similar functionality to the
-    logging module in the Python stdlib. It can be used for debugging purpose
-    to log which methods are used and by default it is inactive, unless the user
-    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
-    Please note it is triggered only once for the same API call within a process.
-    It does not collect any data from open-source users since it is no-op by default.
-    For more information, please refer to
-    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
-    * Logging policy: https://github.com/pytorch/vision/issues/5052;
-
-    Args:
-        obj (class instance or method): an object to extract info from.
-    """
-    module = obj.__module__
-    if not module.startswith("torchvision"):
-        module = f"torchvision.internal.{module}"
-    name = obj.__class__.__name__
-    if isinstance(obj, FunctionType):
-        name = obj.__name__
-    torch._C._log_api_usage_once(f"{module}.{name}")
-
-class VisionDataset(data.Dataset):
-    """
-    Base Class For making datasets which are compatible with torchvision.
-    It is necessary to override the ``__getitem__`` and ``__len__`` method.
-
-    Args:
-        root (string): Root directory of dataset.
-        transforms (callable, optional): A function/transforms that takes in
-            an image and a label and returns the transformed versions of both.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.RandomCrop``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-
-    .. note::
-
-        :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
-    """
-
-    _repr_indent = 4
-
-    def __init__(
-        self,
-        root: str,
-        transforms: Optional[Callable] = None,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-    ) -> None:
-        _log_api_usage_once(self)
-        if isinstance(root, str):
-            root = os.path.expanduser(root)
-        self.root = root
-
-        has_transforms = transforms is not None
-        has_separate_transform = transform is not None or target_transform is not None
-        if has_transforms and has_separate_transform:
-            raise ValueError("Only transforms or transform/target_transform can be passed as argument")
-
-        # for backwards-compatibility
-        self.transform = transform
-        self.target_transform = target_transform
-
-        if has_separate_transform:
-            transforms = StandardTransform(transform, target_transform)
-        self.transforms = transforms
-
-    def __getitem__(self, index: int) -> Any:
-        """
-        Args:
-            index (int): Index
-
-        Returns:
-            (Any): Sample and meta data, optionally transformed by the respective transforms.
-        """
-        raise NotImplementedError
-
-    def __len__(self) -> int:
-        raise NotImplementedError
-
-    def __repr__(self) -> str:
-        head = "Dataset " + self.__class__.__name__
-        body = [f"Number of datapoints: {self.__len__()}"]
-        if self.root is not None:
-            body.append(f"Root location: {self.root}")
-        body += self.extra_repr().splitlines()
-        if hasattr(self, "transforms") and self.transforms is not None:
-            body += [repr(self.transforms)]
-        lines = [head] + [" " * self._repr_indent + line for line in body]
-        return "\n".join(lines)
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def extra_repr(self) -> str:
-        return ""
-
-
-class StandardTransform:
-    def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
-        self.transform = transform
-        self.target_transform = target_transform
-
-    def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
-        if self.transform is not None:
-            input = self.transform(input)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
-        return input, target
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def __repr__(self) -> str:
-        body = [self.__class__.__name__]
-        if self.transform is not None:
-            body += self._format_transform_repr(self.transform, "Transform: ")
-        if self.target_transform is not None:
-            body += self._format_transform_repr(self.target_transform, "Target transform: ")
-
-        return "\n".join(body)
diff --git a/models/cv/object_detection/yolov5/ixrt/deploy.py b/models/cv/object_detection/yolov5/ixrt/deploy.py
deleted file mode 100644
index ec56b7ab..00000000
--- a/models/cv/object_detection/yolov5/ixrt/deploy.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# !/usr/bin/env python
-# -*- coding: utf-8 -*-
-import argparse
-from tensorrt.deploy.api import GraphTransform, create_source, create_target
-
-class Transform:
-    def __init__(self, graph):
-        self.t = GraphTransform(graph)
-        self.graph = graph
-
-    def ReplaceFocus(self, input_edge, outputs, to_op):
-        input_var = self.graph.get_variable(input_edge)
-        op = self.graph.get_operator(to_op)
-        self.t.delete_operators_between_var_op(
-            from_var=input_var, to_op=op
-        )
-        self.t.make_operator(
-            "Focus", inputs=input_edge, outputs=outputs
-        )
-        return self.graph
-
-    def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes):
-        if attributes["anchor"] is None:
-            del attributes["anchor"]
-        self.t.make_operator(
-            op_type, inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-    def AddConcatOp(self, inputs: list, outputs, **attributes):
-        self.t.make_operator(
-            "Concat", inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-def customize_ops(graph, args):
-    t = Transform(graph)
-    fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None
-    if fuse_focus:
-        graph = t.ReplaceFocus(
-            input_edge=args.focus_input,
-            outputs=args.focus_output,
-            to_op=args.focus_last_node
-        )
-    decoder_input = args.decoder_input_names
-    num = len(decoder_input) // 3
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[:num],
-        outputs=["decoder_8"],
-        op_type=args.decoder_type,
-        anchor=args.decoder8_anchor,
-        num_class=args.num_class,
-        stride=8,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num:num*2],
-        outputs=["decoder_16"],
-        op_type=args.decoder_type,
-        anchor=args.decoder16_anchor,
-        num_class=args.num_class,
-        stride=16,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num*2:num*2+1],
-        outputs=["decoder_32"],
-        op_type=args.decoder_type,
-        anchor=args.decoder32_anchor,
-        num_class=args.num_class,
-        stride=32,
-        faster_impl=args.faster
-    )
-    if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2+1:],
-            outputs=["decoder_64"],
-            op_type=args.decoder_type,
-            anchor=args.decoder64_anchor,
-            num_class=args.num_class,
-            stride=64,
-            faster_impl=args.faster
-        )
-        graph = t.AddConcatOp(
-            inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"],
-            outputs=["output"],
-            axis=1
-        )
-    elif args.with_nms:
-        graph = t.AddConcatOp(
-            inputs=["decoder_32", "decoder_16", "decoder_8"],
-            outputs=["output"],
-            axis=1
-        )
-
-        graph.outputs.clear()
-        graph.add_output("output")
-        graph.outputs["output"].dtype = "FLOAT"
-    else:
-        graph.outputs.clear()
-        graph.add_output("decoder_8")
-        graph.outputs["decoder_8"].dtype = "FLOAT"
-        graph.add_output("decoder_16")
-        graph.outputs["decoder_16"].dtype = "FLOAT"
-        graph.add_output("decoder_32")
-        graph.outputs["decoder_32"].dtype = "FLOAT"
-    return graph
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--src", type=str)
-    parser.add_argument("--dst", type=str)
-    parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
-    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
-    parser.add_argument("--decoder_input_names", nargs='+', type=str)
-    parser.add_argument("--decoder8_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder16_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder32_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
-    parser.add_argument("--num_class", type=int, default=80)
-    parser.add_argument("--faster", type=int, default=1)
-    parser.add_argument("--focus_input", type=str, default=None)
-    parser.add_argument("--focus_output", type=str, default=None)
-    parser.add_argument("--focus_last_node", type=str, default=None)
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-
-    args = parse_args()
-    graph = create_source(args.src)()
-    graph = customize_ops(graph, args)
-    create_target(saved_path=args.dst).export(graph)
-    print("Surged onnx lies on", args.dst)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/inference.py b/models/cv/object_detection/yolov5/ixrt/inference.py
deleted file mode 100644
index 5f5452d5..00000000
--- a/models/cv/object_detection/yolov5/ixrt/inference.py
+++ /dev/null
@@ -1,260 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import argparse
-import glob
-import json
-import os
-import time
-import sys
-
-import torch
-import numpy as np
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-from coco_labels import coco80_to_coco91_class, labels
-from common import save2json, box_class85to6
-from common import create_engine_context, get_io_bindings
-from calibration_dataset import create_dataloaders
-from datasets.post_process import get_post_process
-
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-from tqdm import tqdm
-from tqdm.contrib import tzip
-
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-
-    # Load dataloader
-    dataloader = create_dataloaders(
-        data_path=config.eval_dir,
-        annFile=config.coco_gt,
-        img_sz=config.imgsz,
-        batch_size=config.bsz,
-        step=config.loop_count,
-        data_process_type=config.data_process_type
-    )
-
-    # Load post process func
-    if config.test_mode == "MAP":
-        post_process_func = get_post_process(config.data_process_type)
-
-    bsz = config.bsz
-    num_samples = 5000
-    if config.loop_count > 0:
-        num_samples = bsz * config.loop_count
-    num_batch = len(dataloader)
-    print("=" * 30)
-    print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}")
-    print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}")
-    print("=" * 30)
-
-    json_result = []
-    forward_time = 0.0
-    class_map = coco80_to_coco91_class()
-
-    host_mem = tensorrt.IHostMemory
-    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
-
-    # Load Engine
-    engine, context = create_engine_context(config.model_engine, logger)
-    inputs, outputs, allocations = get_io_bindings(engine)
-
-    # Load nms_engine
-    if config.test_mode == "MAP" and config.nms_type == "GPU":
-        nms_engine, nms_context = create_engine_context(config.nms_engine, logger)
-        nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine)
-        nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"])
-        nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"])
-        print(f"nms_output0 shape : {nms_output0.shape}   nms_output0 type : {nms_output0.dtype}")
-        print(f"nms_output1 shape : {nms_output1.shape}   nms_output1 type : {nms_output1.dtype}")
-
-    # Warm up
-    if config.warm_up > 0:
-        print("\nWarm Start.")
-        for i in range(config.warm_up):
-            context.execute_v2(allocations)
-        print("Warm Done.")
-
-    # Prepare the output data
-    output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
-    print(f"output shape : {output.shape} output type : {output.dtype}")
-
-    for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader):
-        batch_data = batch_data.numpy()
-        batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()]
-        # batch_img_id = batch_img_id.numpy()
-
-        cur_bsz_sample = batch_data.shape[0]
-
-        # Set input
-        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-        # Forward
-        # start_time = time.time()
-        context.execute_v2(allocations)
-        # end_time = time.time()
-        # forward_time += end_time - start_time
-
-        if config.test_mode == "MAP":
-            # Fetch output
-            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
-            assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 1 : prepare data to nms
-            _, box_num, box_unit = output.shape
-            if config.debug:
-                print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}")
-
-            if config.decoder_faster == 0:
-                nms_input = box_class85to6(output.reshape(-1, box_unit))
-            else:
-                nms_input = output
-
-            # Step 2 : nms
-            # cpu nms(TODO)
-
-            # gpu nms
-            if config.nms_type == "GPU":
-
-                # Set nms input
-                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                nms_context.execute_v2(nms_allocations)
-                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 3 : post process + save
-            pred_boxes = post_process_func(
-                ori_img_shape=batch_img_shape,
-                imgsz=(config.imgsz, config.imgsz),
-                box_datas=nms_output0,
-                box_nums=nms_output1,
-                sample_num=cur_bsz_sample,
-                max_det=config.max_det
-            )
-            save2json(batch_img_id, pred_boxes, json_result, class_map)
-
-    # fps = num_samples / forward_time
-
-    if config.test_mode == "FPS":
-        start_time = time.time()       
-        for i in range(config.loop_count):
-            context.execute_v2(allocations)  
-        end_time = time.time()  
-        forward_time = end_time - start_time      
-        fps = (config.loop_count*config.bsz) / forward_time
-        print("FPS : ", fps)
-        print(f"Performance Check : Test {fps} >= target {config.fps_target}")
-        if fps >= config.fps_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-    if config.test_mode == "MAP":
-        if len(json_result) == 0:
-            print("Predict zero box!")
-            exit(10)
-
-        if not os.path.exists(config.pred_dir):
-            os.makedirs(config.pred_dir)
-
-        pred_json = os.path.join(
-            config.pred_dir, f"{config.model_name}_{config.precision}_preds.json"
-        )
-        with open(pred_json, "w") as f:
-            json.dump(json_result, f)
-
-        anno_json = config.coco_gt
-        anno = COCO(anno_json)  # init annotations api
-        pred = anno.loadRes(pred_json)  # init predictions api
-        eval = COCOeval(anno, pred, "bbox")
-
-        eval.evaluate()
-        eval.accumulate()
-        print(
-            f"==============================eval {config.model_name} {config.precision} coco map =============================="
-        )
-        eval.summarize()
-
-        map, map50 = eval.stats[:2]
-        print("MAP@0.5 : ", map50)
-        print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        if map50 >= config.map_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-def parse_config():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX"
-    )
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
-    parser.add_argument(
-        "--model_engine",
-        type=str,
-        default="",
-        help="model engine path",
-    )
-    parser.add_argument(
-        "--nms_engine",
-        type=str,
-        default="",
-        help="nms engine path",
-    )
-    parser.add_argument(
-        "--coco_gt",
-        type=str,
-        default="data/datasets/cv/coco2017/annotations/instances_val2017.json",
-        help="coco instances_val2017.json",
-    )
-    parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
-    parser.add_argument("--loop_count", type=int, default=-1, help="loop count")
-    parser.add_argument(
-        "--eval_dir",
-        type=str,
-        default="data/datasets/cv/coco2017/val2017",
-        help="coco image dir",
-    )
-    parser.add_argument("--bsz", type=int, default=32, help="test batch size")
-    parser.add_argument(
-        "--imgsz",
-        "--img",
-        "--img-size",
-        type=int,
-        default=640,
-        help="inference size h,w",
-    )
-    parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--use_async", action="store_true")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs")
-    parser.add_argument("--map_target", type=float, default=0.56, help="target mAP")
-    parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
-    parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly")
-    parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU")
-
-    config = parser.parse_args()
-    print("config:", config)
-    return config
-
-if __name__ == "__main__":
-    config = parse_config()
-    main(config)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
deleted file mode 100644
index 932efbdf..00000000
--- a/models/cv/object_detection/yolov5/ixrt/load_ixrt_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import ctypes
-import tensorrt
-from os.path import join, dirname, exists
-def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    ctypes.CDLL(dynamic_path)
-    tensorrt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
deleted file mode 100644
index 00ed65dd..00000000
--- a/models/cv/object_detection/yolov5/ixrt/modify_batchsize.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import onnx
-import argparse
-
-def change_input_dim(model, bsz):
-    batch_size = bsz
-
-    # The following code changes the first dimension of every input to be batch_size
-    # Modify as appropriate ... note that this requires all inputs to
-    # have the same batch_size
-    inputs = model.graph.input
-    for input in inputs:
-        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
-        # Add checks as needed.
-        dim1 = input.type.tensor_type.shape.dim[0]
-        # update dim to be a symbolic value
-        if isinstance(batch_size, str):
-            # set dynamic batch size
-            dim1.dim_param = batch_size
-        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
-            # set given batch size
-            dim1.dim_value = int(batch_size)
-        else:
-            # set batch size of 1
-            dim1.dim_value = 1
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--batch_size", type=int)
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-model = onnx.load(args.origin_model)
-change_input_dim(model, args.batch_size)
-onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/quant.py b/models/cv/object_detection/yolov5/ixrt/quant.py
deleted file mode 100644
index d73212ca..00000000
--- a/models/cv/object_detection/yolov5/ixrt/quant.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import random
-import argparse
-import numpy as np
-from tensorrt.deploy import static_quantize
-
-import torch
-import sys
-sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
-print(sys.path)
-from calibration_dataset import create_dataloaders
-
-def setseed(seed=42):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str)
-    parser.add_argument("--model", type=str,  default="yolov5s_with_decoder.onnx")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--dataset_dir", type=str,  default="./coco2017/val2017")
-    parser.add_argument("--ann_file", type=str,  default="./coco2017/annotations/instances_val2017.json")
-    parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
-    parser.add_argument("--disable_quant_names", nargs='*', type=str)
-    parser.add_argument("--save_dir", type=str,  help="save path", default=None)
-    parser.add_argument("--bsz", type=int, default=32)
-    parser.add_argument("--step", type=int, default=20)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--imgsz", type=int, default=640)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-setseed(args.seed)
-model_name = args.model_name
-
-out_dir = args.save_dir
-dataloader = create_dataloaders(
-    data_path=args.dataset_dir,
-    annFile=args.ann_file,
-    img_sz=args.imgsz,
-    batch_size=args.bsz,
-    step=args.step,
-    data_process_type=args.data_process_type
-)
-# print("disable_quant_names : ", args.disable_quant_names)
-static_quantize(args.model,
-        calibration_dataloader=dataloader,
-        save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
-        observer=args.observer,
-        data_preprocess=lambda x: x[0].to("cuda"),
-        quant_format="qdq",
-        disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/requirements.txt b/models/cv/object_detection/yolov5/ixrt/requirements.txt
deleted file mode 100644
index 10a9fba6..00000000
--- a/models/cv/object_detection/yolov5/ixrt/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-tqdm
-onnx
-onnxsim
-ultralytics==8.3.97
-pycocotools
-opencv-python==4.6.0.66
-pycuda
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
index cd65d210..52ec959f 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.626
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
index 3f841458..5e2f97fb 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_fp16_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=735
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -80,7 +76,6 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
-
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -116,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -124,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -139,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -155,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -182,7 +176,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}              \
+    --model_engine=${ENGINE_FILE}               \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
index 24829da8..606fc94c 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.626
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
index 8afcb722..b2983669 100644
--- a/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
+++ b/models/cv/object_detection/yolov5/ixrt/scripts/infer_yolov5_int8_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=735
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -116,7 +112,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -124,7 +120,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -139,7 +134,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -155,7 +150,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5/ixrt/simplify_model.py b/models/cv/object_detection/yolov5/ixrt/simplify_model.py
deleted file mode 100644
index b4254b6f..00000000
--- a/models/cv/object_detection/yolov5/ixrt/simplify_model.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-# Simplify
-def simplify_model(args):
-    onnx_model = onnx.load(args.origin_model)
-    model_simp, check = simplify(onnx_model)
-    model_simp = onnx.shape_inference.infer_shapes(model_simp)
-    onnx.save(model_simp, args.output_model)
-    print("  Simplify onnx Done.")
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-simplify_model(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/README.md b/models/cv/object_detection/yolov5s/ixrt/README.md
index 245205bf..88f55f22 100755
--- a/models/cv/object_detection/yolov5s/ixrt/README.md
+++ b/models/cv/object_detection/yolov5s/ixrt/README.md
@@ -27,7 +27,7 @@ yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 ```
 
 ### Model Conversion
@@ -53,13 +53,13 @@ popd
 ## Model Inference
 
 ```bash
-export PROJ_DIR=/Path/to/yolov5s/ixrt
+export PROJ_DIR=./
 export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
 export EVAL_DIR=${DATASETS_DIR}/images/val2017
-export RUN_DIR=${PROJ_DIR}/
-export CONFIG_DIR=config/YOLOV5S_CONFIG
+export RUN_DIR=../../ixrt_common
+export CONFIG_DIR=../../ixrt_common/config/YOLOV5S_CONFIG
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/yolov5s/ixrt/build_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_engine.py
deleted file mode 100644
index d47e45e5..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/build_engine.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import cv2
-import argparse
-import numpy as np
-
-import torch
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(config.model)
-
-    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
-    # print("precision : ", precision)
-    build_config.set_flag(precision)
-
-    plan = builder.build_serialized_network(network, build_config)
-    engine_file_path = config.engine
-    with open(engine_file_path, "wb") as f:
-        f.write(plan)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    # engine args
-    parser.add_argument("--engine", type=str, default=None)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
deleted file mode 100644
index 25f0ab8a..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/build_nms_engine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import argparse
-import torch
-import onnx
-from onnx import helper
-from onnx import TensorProto, numpy_helper
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-def create_onnx(args):
-    nms = helper.make_node(
-        "DetectionNMS_IxRT",
-        name="NMS",
-        inputs=["nms_input"],
-        outputs=["nms_output0", "nms_output1"],
-        nMaxKeep=args.max_box_pre_img,
-        fIoUThresh=args.iou_thresh,
-        fScoreThresh=args.score_thresh
-    )
-    graph = helper.make_graph(
-        nodes=[nms],
-        name="gpu_nms",
-        inputs=[
-            helper.make_tensor_value_info(
-                "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6)
-            )
-        ],
-        outputs=[
-            helper.make_tensor_value_info(
-                "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6)
-            ),
-            helper.make_tensor_value_info(
-                "nms_output1", onnx.TensorProto.INT32, (args.bsz,)
-            )
-        ],
-        initializer=[]
-    )
-
-    op = onnx.OperatorSetIdProto()
-    op.version = 13
-    model = onnx.helper.make_model(graph)
-
-    model = onnx.helper.make_model(graph, opset_imports=[op])
-    onnx_path = args.path + "/nms.onnx"
-    onnx.save(model, onnx_path)
-
-def build_engine(args):
-    onnx_path = args.path + "/nms.onnx"
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(onnx_path)
-    plan = builder.build_serialized_network(network, build_config)
-
-    engine_path = args.path + "/nms.engine"
-    with open(engine_path, "wb") as f:
-        f.write(plan)
-
-def main(args):
-    create_onnx(args)
-    build_engine(args)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--bsz", type=int, default=1, help="batch size")
-    parser.add_argument("--path", type=str)
-    parser.add_argument("--all_box_num", type=int, default=25200)
-    parser.add_argument("--max_box_pre_img", type=int, default=1000)
-    parser.add_argument("--iou_thresh", type=float, default=0.6)
-    parser.add_argument("--score_thresh", type=float, default=0.001)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
deleted file mode 100644
index 578e013d..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/calibration_dataset.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-
-
-
-from datasets.coco import CocoDetection
-
-def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
-    dataset = CocoDetection(
-        root=data_path,
-        annFile=annFile,
-        img_size=img_sz,
-        data_process_type=data_process_type
-    )
-    calibration_dataset = dataset
-    num_samples = min(5000, batch_size * step)
-    if num_samples > 0:
-        calibration_dataset = torch.utils.data.Subset(
-            dataset, indices=range(num_samples)
-        )
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=False,
-        batch_size=batch_size,
-        drop_last=False,
-        num_workers=workers,
-    )
-    return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh
index b9f3a57f..a08c47d7 100644
--- a/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/ci/prepare.sh
@@ -25,7 +25,7 @@ else
     echo "Not Support Os"
 fi
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 
 mkdir -p checkpoints
 cp -r /root/data/3rd_party/yolov5 ./
diff --git a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py b/models/cv/object_detection/yolov5s/ixrt/coco_labels.py
deleted file mode 100644
index 69d38878..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/coco_labels.py
+++ /dev/null
@@ -1,89 +0,0 @@
-labels = [
-    "person",
-    "bicycle",
-    "car",
-    "motorcycle",
-    "airplane",
-    "bus",
-    "train",
-    "truck",
-    "boat",
-    "traffic light",
-    "fire hydrant",
-    "stop sign",
-    "parking meter",
-    "bench",
-    "bird",
-    "cat",
-    "dog",
-    "horse",
-    "sheep",
-    "cow",
-    "elephant",
-    "bear",
-    "zebra",
-    "giraffe",
-    "backpack",
-    "umbrella",
-    "handbag",
-    "tie",
-    "suitcase",
-    "frisbee",
-    "skis",
-    "snowboard",
-    "sports ball",
-    "kite",
-    "baseball bat",
-    "baseball glove",
-    "skateboard",
-    "surfboard",
-    "tennis racket",
-    "bottle",
-    "wine glass",
-    "cup",
-    "fork",
-    "knife",
-    "spoon",
-    "bowl",
-    "banana",
-    "apple",
-    "sandwich",
-    "orange",
-    "broccoli",
-    "carrot",
-    "hot dog",
-    "pizza",
-    "donut",
-    "cake",
-    "chair",
-    "couch",
-    "potted plant",
-    "bed",
-    "dining table",
-    "toilet",
-    "tv",
-    "laptop",
-    "mouse",
-    "remote",
-    "keyboard",
-    "cell phone",
-    "microwave",
-    "oven",
-    "toaster",
-    "sink",
-    "refrigerator",
-    "book",
-    "clock",
-    "vase",
-    "scissors",
-    "teddy bear",
-    "hair drier",
-    "toothbrush",
-]
-def coco80_to_coco91_class():  # converts 80-index (val2014) to 91-index (paper)
-    return [
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
-        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-        64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
-
-__all__ = ["labels"]
diff --git a/models/cv/object_detection/yolov5s/ixrt/common.py b/models/cv/object_detection/yolov5s/ixrt/common.py
deleted file mode 100644
index 5f543555..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/common.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-
-import tensorrt
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-# input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
-# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
-def box_class85to6(input):
-    center_x_y = input[:, :2]
-    side = input[:, 2:4]
-    conf = input[:, 4:5]
-    class_id = np.argmax(input[:, 5:], axis = -1)
-    class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
-    max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
-    x1_y1 = center_x_y - 0.5 * side
-    x2_y2 = center_x_y + 0.5 * side
-    nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
-    return nms_input
-
-def save2json(batch_img_id, pred_boxes, json_result, class_trans):
-    for i, boxes in enumerate(pred_boxes):
-        if boxes is not None:
-            image_id = int(batch_img_id[i])
-            # have no target
-            if image_id == -1:
-                continue
-            for x, y, w, h, c, p in boxes:
-                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
-                c = int(c)
-                json_result.append(
-                    {
-                        "image_id": image_id,
-                        "category_id": class_trans[c - 1],
-                        "bbox": [x, y, w, h],
-                        "score": p,
-                    }
-                )
-
-def create_engine_context(engine_path, logger):
-    with open(engine_path, "rb") as f:
-        runtime = tensorrt.Runtime(logger)
-        assert runtime
-        engine = runtime.deserialize_cuda_engine(f.read())
-        assert engine
-        context = engine.create_execution_context()
-        assert context
-
-    return engine, context
-
-def get_io_bindings(engine):
-    # Setup I/O bindings
-    inputs = []
-    outputs = []
-    allocations = []
-
-    for i in range(engine.num_bindings):
-        is_input = False
-        if engine.binding_is_input(i):
-            is_input = True
-        name = engine.get_binding_name(i)
-        dtype = engine.get_binding_dtype(i)
-        shape = engine.get_binding_shape(i)
-        if is_input:
-            batch_size = shape[0]
-        size = np.dtype(tensorrt.nptype(dtype)).itemsize
-        for s in shape:
-            size *= s
-        err, allocation = cudart.cudaMalloc(size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-        binding = {
-            "index": i,
-            "name": name,
-            "dtype": np.dtype(tensorrt.nptype(dtype)),
-            "shape": list(shape),
-            "allocation": allocation,
-            "nbytes": size,
-        }
-        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
-        allocations.append(allocation)
-        if engine.binding_is_input(i):
-            inputs.append(binding)
-        else:
-            outputs.append(binding)
-    return inputs, outputs, allocations
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/cut_model.py b/models/cv/object_detection/yolov5s/ixrt/cut_model.py
deleted file mode 100644
index af0a3a4f..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/cut_model.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    parser.add_argument("--input_names", nargs='+', type=str)
-    parser.add_argument("--output_names", nargs='+', type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
-print("  Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov5s/ixrt/datasets/__init__.py
deleted file mode 100755
index e69de29b..00000000
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py b/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
deleted file mode 100755
index 7f355b84..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/coco.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os.path
-from typing import Any, Callable, List, Optional, Tuple
-
-import cv2
-
-from .vision import VisionDataset
-from .pre_process import get_post_process
-class CocoDetection(VisionDataset):
-    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        annFile: str,
-        img_size: int,
-        data_process_type: str,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-        transforms: Optional[Callable] = None,
-        
-    ) -> None:
-        super().__init__(root, transforms, transform, target_transform)
-        from pycocotools.coco import COCO
-
-        self.coco = COCO(annFile)
-        self.ids = list(sorted(self.coco.imgs.keys()))
-        self.img_size = img_size
-        
-        self.transforms = get_post_process(data_process_type)
-
-    def _load_image(self, id: int):
-        path = self.coco.loadImgs(id)[0]["file_name"]
-        data = cv2.imread(os.path.join(self.root, path))
-        return data
-
-    def _load_target(self, id: int) -> List[Any]:
-        return self.coco.loadAnns(self.coco.getAnnIds(id))
-
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        id = self.ids[index]
-        image = self._load_image(id)
-        target = self._load_target(id)
-        origin_shape = image.shape[:2]
-
-        if self.transforms is not None:
-            image = self.transforms(image, self.img_size)
-
-        if len(target) > 0:
-            image_id = target[0]["image_id"]
-        else:
-            # have no target
-            image_id = -1
-        return image, origin_shape, image_id
-
-    def __len__(self) -> int:
-        return len(self.ids)
-
-
-class CocoCaptions(CocoDetection):
-    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-
-    Example:
-
-        .. code:: python
-
-            import torchvision.datasets as dset
-            import torchvision.transforms as transforms
-            cap = dset.CocoCaptions(root = 'dir where images are',
-                                    annFile = 'json annotation file',
-                                    transform=transforms.PILToTensor())
-
-            print('Number of samples: ', len(cap))
-            img, target = cap[3] # load 4th sample
-
-            print("Image Size: ", img.size())
-            print(target)
-
-        Output: ::
-
-            Number of samples: 82783
-            Image Size: (3L, 427L, 640L)
-            [u'A plane emitting smoke stream flying over a mountain.',
-            u'A plane darts across a bright blue sky behind a mountain covered in snow',
-            u'A plane leaves a contrail above the snowy mountain top.',
-            u'A mountain that has a plane flying overheard in the distance.',
-            u'A mountain view with a plume of smoke in the background']
-
-    """
-
-    def _load_target(self, id: int) -> List[str]:
-        return [ann["caption"] for ann in super()._load_target(id)]
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py b/models/cv/object_detection/yolov5s/ixrt/datasets/common.py
deleted file mode 100755
index e120e00f..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/common.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
-    # Resize and pad image while meeting stride-multiple constraints
-    shape = im.shape[:2]  # current shape [height, width]
-    if isinstance(new_shape, int):
-        new_shape = (new_shape, new_shape)
-
-    # Scale ratio (new / old)
-    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-    if not scaleup:  # only scale down, do not scale up (for better val mAP)
-        r = min(r, 1.0)
-
-    # Compute padding
-    ratio = r, r  # width, height ratios
-    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
-    if auto:  # minimum rectangle
-        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
-    elif scaleFill:  # stretch
-        dw, dh = 0.0, 0.0
-        new_unpad = (new_shape[1], new_shape[0])
-        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
-
-    dw /= 2  # divide padding into 2 sides
-    dh /= 2
-
-    if shape[::-1] != new_unpad:  # resize
-        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
-    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
-    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
-    return im, ratio, (dw, dh)
-
-def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
-    # Rescale boxes (xyxy) from net_shape to ori_shape
-
-    if use_letterbox:
-
-        gain = min(
-            net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
-        )  # gain  = new / old
-        pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
-            net_shape[0] - ori_shape[0] * gain
-        ) / 2.0
-
-        boxes[:, [0, 2]] -= pad[0]  # x padding
-        boxes[:, [1, 3]] -= pad[1]  # y padding
-        boxes[:, :4] /= gain
-    else:
-        x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
-
-        boxes[:, 0] /= x_scale
-        boxes[:, 1] /= y_scale
-        boxes[:, 2] /= x_scale
-        boxes[:, 3] /= y_scale
-
-    clip_boxes(boxes, ori_shape)
-    return boxes
-
-def clip_boxes(boxes, shape):
-
-    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
-    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
deleted file mode 100755
index a58c02f8..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/post_process.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox, scale_boxes, clip_boxes
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Postprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Postprocess
-    elif data_process_type == "yolox":
-        return YoloxPostprocess
-    return None
-
-def Yolov3Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=False
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def Yolov5Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=True
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def YoloxPostprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i])
-            boxes[:, :4] /= r
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-            clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i]))
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
deleted file mode 100755
index 8cc643a8..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/pre_process.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Preprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Preprocess
-    elif data_process_type == "yolox":
-        return YoloxPreprocess
-    return None
-
-def Yolov3Preprocess(image, img_size):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    image = cv2.resize(image, (img_size, img_size))
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def Yolov5Preprocess(image, img_size, augment=False):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    if r != 1:  # if sizes are not equal
-        interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA
-        image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
-
-    # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  rect == True
-
-    image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False)
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def YoloxPreprocess(img, img_size, swap=(2,0,1)):
-
-    padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114
-    r = min(img_size / img.shape[0], img_size / img.shape[1])
-    resized_img = cv2.resize(
-        img,
-        (int(img.shape[1] * r), int(img.shape[0] * r)),
-        interpolation=cv2.INTER_LINEAR, 
-    ).astype(np.uint8)
-
-    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
-    padded_img = padded_img.transpose(swap)
-    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
-
-    return padded_img
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py b/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
deleted file mode 100755
index 32da4a78..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/datasets/vision.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-from typing import Any, Callable, List, Optional, Tuple
-
-import torch
-import torch.utils.data as data
-
-from types import FunctionType
-
-def _log_api_usage_once(obj: Any) -> None:
-
-    """
-    Logs API usage(module and name) within an organization.
-    In a large ecosystem, it's often useful to track the PyTorch and
-    TorchVision APIs usage. This API provides the similar functionality to the
-    logging module in the Python stdlib. It can be used for debugging purpose
-    to log which methods are used and by default it is inactive, unless the user
-    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
-    Please note it is triggered only once for the same API call within a process.
-    It does not collect any data from open-source users since it is no-op by default.
-    For more information, please refer to
-    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
-    * Logging policy: https://github.com/pytorch/vision/issues/5052;
-
-    Args:
-        obj (class instance or method): an object to extract info from.
-    """
-    module = obj.__module__
-    if not module.startswith("torchvision"):
-        module = f"torchvision.internal.{module}"
-    name = obj.__class__.__name__
-    if isinstance(obj, FunctionType):
-        name = obj.__name__
-    torch._C._log_api_usage_once(f"{module}.{name}")
-
-class VisionDataset(data.Dataset):
-    """
-    Base Class For making datasets which are compatible with torchvision.
-    It is necessary to override the ``__getitem__`` and ``__len__`` method.
-
-    Args:
-        root (string): Root directory of dataset.
-        transforms (callable, optional): A function/transforms that takes in
-            an image and a label and returns the transformed versions of both.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.RandomCrop``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-
-    .. note::
-
-        :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
-    """
-
-    _repr_indent = 4
-
-    def __init__(
-        self,
-        root: str,
-        transforms: Optional[Callable] = None,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-    ) -> None:
-        _log_api_usage_once(self)
-        if isinstance(root, str):
-            root = os.path.expanduser(root)
-        self.root = root
-
-        has_transforms = transforms is not None
-        has_separate_transform = transform is not None or target_transform is not None
-        if has_transforms and has_separate_transform:
-            raise ValueError("Only transforms or transform/target_transform can be passed as argument")
-
-        # for backwards-compatibility
-        self.transform = transform
-        self.target_transform = target_transform
-
-        if has_separate_transform:
-            transforms = StandardTransform(transform, target_transform)
-        self.transforms = transforms
-
-    def __getitem__(self, index: int) -> Any:
-        """
-        Args:
-            index (int): Index
-
-        Returns:
-            (Any): Sample and meta data, optionally transformed by the respective transforms.
-        """
-        raise NotImplementedError
-
-    def __len__(self) -> int:
-        raise NotImplementedError
-
-    def __repr__(self) -> str:
-        head = "Dataset " + self.__class__.__name__
-        body = [f"Number of datapoints: {self.__len__()}"]
-        if self.root is not None:
-            body.append(f"Root location: {self.root}")
-        body += self.extra_repr().splitlines()
-        if hasattr(self, "transforms") and self.transforms is not None:
-            body += [repr(self.transforms)]
-        lines = [head] + [" " * self._repr_indent + line for line in body]
-        return "\n".join(lines)
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def extra_repr(self) -> str:
-        return ""
-
-
-class StandardTransform:
-    def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
-        self.transform = transform
-        self.target_transform = target_transform
-
-    def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
-        if self.transform is not None:
-            input = self.transform(input)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
-        return input, target
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def __repr__(self) -> str:
-        body = [self.__class__.__name__]
-        if self.transform is not None:
-            body += self._format_transform_repr(self.transform, "Transform: ")
-        if self.target_transform is not None:
-            body += self._format_transform_repr(self.target_transform, "Target transform: ")
-
-        return "\n".join(body)
diff --git a/models/cv/object_detection/yolov5s/ixrt/deploy.py b/models/cv/object_detection/yolov5s/ixrt/deploy.py
deleted file mode 100644
index ec56b7ab..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/deploy.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# !/usr/bin/env python
-# -*- coding: utf-8 -*-
-import argparse
-from tensorrt.deploy.api import GraphTransform, create_source, create_target
-
-class Transform:
-    def __init__(self, graph):
-        self.t = GraphTransform(graph)
-        self.graph = graph
-
-    def ReplaceFocus(self, input_edge, outputs, to_op):
-        input_var = self.graph.get_variable(input_edge)
-        op = self.graph.get_operator(to_op)
-        self.t.delete_operators_between_var_op(
-            from_var=input_var, to_op=op
-        )
-        self.t.make_operator(
-            "Focus", inputs=input_edge, outputs=outputs
-        )
-        return self.graph
-
-    def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes):
-        if attributes["anchor"] is None:
-            del attributes["anchor"]
-        self.t.make_operator(
-            op_type, inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-    def AddConcatOp(self, inputs: list, outputs, **attributes):
-        self.t.make_operator(
-            "Concat", inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-def customize_ops(graph, args):
-    t = Transform(graph)
-    fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None
-    if fuse_focus:
-        graph = t.ReplaceFocus(
-            input_edge=args.focus_input,
-            outputs=args.focus_output,
-            to_op=args.focus_last_node
-        )
-    decoder_input = args.decoder_input_names
-    num = len(decoder_input) // 3
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[:num],
-        outputs=["decoder_8"],
-        op_type=args.decoder_type,
-        anchor=args.decoder8_anchor,
-        num_class=args.num_class,
-        stride=8,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num:num*2],
-        outputs=["decoder_16"],
-        op_type=args.decoder_type,
-        anchor=args.decoder16_anchor,
-        num_class=args.num_class,
-        stride=16,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num*2:num*2+1],
-        outputs=["decoder_32"],
-        op_type=args.decoder_type,
-        anchor=args.decoder32_anchor,
-        num_class=args.num_class,
-        stride=32,
-        faster_impl=args.faster
-    )
-    if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2+1:],
-            outputs=["decoder_64"],
-            op_type=args.decoder_type,
-            anchor=args.decoder64_anchor,
-            num_class=args.num_class,
-            stride=64,
-            faster_impl=args.faster
-        )
-        graph = t.AddConcatOp(
-            inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"],
-            outputs=["output"],
-            axis=1
-        )
-    elif args.with_nms:
-        graph = t.AddConcatOp(
-            inputs=["decoder_32", "decoder_16", "decoder_8"],
-            outputs=["output"],
-            axis=1
-        )
-
-        graph.outputs.clear()
-        graph.add_output("output")
-        graph.outputs["output"].dtype = "FLOAT"
-    else:
-        graph.outputs.clear()
-        graph.add_output("decoder_8")
-        graph.outputs["decoder_8"].dtype = "FLOAT"
-        graph.add_output("decoder_16")
-        graph.outputs["decoder_16"].dtype = "FLOAT"
-        graph.add_output("decoder_32")
-        graph.outputs["decoder_32"].dtype = "FLOAT"
-    return graph
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--src", type=str)
-    parser.add_argument("--dst", type=str)
-    parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
-    parser.add_argument("--with_nms", type=bool, default=False, help="engine with nms")
-    parser.add_argument("--decoder_input_names", nargs='+', type=str)
-    parser.add_argument("--decoder8_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder16_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder32_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
-    parser.add_argument("--num_class", type=int, default=80)
-    parser.add_argument("--faster", type=int, default=1)
-    parser.add_argument("--focus_input", type=str, default=None)
-    parser.add_argument("--focus_output", type=str, default=None)
-    parser.add_argument("--focus_last_node", type=str, default=None)
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-
-    args = parse_args()
-    graph = create_source(args.src)()
-    graph = customize_ops(graph, args)
-    create_target(saved_path=args.dst).export(graph)
-    print("Surged onnx lies on", args.dst)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/inference.py b/models/cv/object_detection/yolov5s/ixrt/inference.py
deleted file mode 100644
index 5f5452d5..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/inference.py
+++ /dev/null
@@ -1,260 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import argparse
-import glob
-import json
-import os
-import time
-import sys
-
-import torch
-import numpy as np
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-from coco_labels import coco80_to_coco91_class, labels
-from common import save2json, box_class85to6
-from common import create_engine_context, get_io_bindings
-from calibration_dataset import create_dataloaders
-from datasets.post_process import get_post_process
-
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-from tqdm import tqdm
-from tqdm.contrib import tzip
-
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-
-    # Load dataloader
-    dataloader = create_dataloaders(
-        data_path=config.eval_dir,
-        annFile=config.coco_gt,
-        img_sz=config.imgsz,
-        batch_size=config.bsz,
-        step=config.loop_count,
-        data_process_type=config.data_process_type
-    )
-
-    # Load post process func
-    if config.test_mode == "MAP":
-        post_process_func = get_post_process(config.data_process_type)
-
-    bsz = config.bsz
-    num_samples = 5000
-    if config.loop_count > 0:
-        num_samples = bsz * config.loop_count
-    num_batch = len(dataloader)
-    print("=" * 30)
-    print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}")
-    print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}")
-    print("=" * 30)
-
-    json_result = []
-    forward_time = 0.0
-    class_map = coco80_to_coco91_class()
-
-    host_mem = tensorrt.IHostMemory
-    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
-
-    # Load Engine
-    engine, context = create_engine_context(config.model_engine, logger)
-    inputs, outputs, allocations = get_io_bindings(engine)
-
-    # Load nms_engine
-    if config.test_mode == "MAP" and config.nms_type == "GPU":
-        nms_engine, nms_context = create_engine_context(config.nms_engine, logger)
-        nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine)
-        nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"])
-        nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"])
-        print(f"nms_output0 shape : {nms_output0.shape}   nms_output0 type : {nms_output0.dtype}")
-        print(f"nms_output1 shape : {nms_output1.shape}   nms_output1 type : {nms_output1.dtype}")
-
-    # Warm up
-    if config.warm_up > 0:
-        print("\nWarm Start.")
-        for i in range(config.warm_up):
-            context.execute_v2(allocations)
-        print("Warm Done.")
-
-    # Prepare the output data
-    output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
-    print(f"output shape : {output.shape} output type : {output.dtype}")
-
-    for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader):
-        batch_data = batch_data.numpy()
-        batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()]
-        # batch_img_id = batch_img_id.numpy()
-
-        cur_bsz_sample = batch_data.shape[0]
-
-        # Set input
-        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-        # Forward
-        # start_time = time.time()
-        context.execute_v2(allocations)
-        # end_time = time.time()
-        # forward_time += end_time - start_time
-
-        if config.test_mode == "MAP":
-            # Fetch output
-            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
-            assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 1 : prepare data to nms
-            _, box_num, box_unit = output.shape
-            if config.debug:
-                print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}")
-
-            if config.decoder_faster == 0:
-                nms_input = box_class85to6(output.reshape(-1, box_unit))
-            else:
-                nms_input = output
-
-            # Step 2 : nms
-            # cpu nms(TODO)
-
-            # gpu nms
-            if config.nms_type == "GPU":
-
-                # Set nms input
-                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                nms_context.execute_v2(nms_allocations)
-                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 3 : post process + save
-            pred_boxes = post_process_func(
-                ori_img_shape=batch_img_shape,
-                imgsz=(config.imgsz, config.imgsz),
-                box_datas=nms_output0,
-                box_nums=nms_output1,
-                sample_num=cur_bsz_sample,
-                max_det=config.max_det
-            )
-            save2json(batch_img_id, pred_boxes, json_result, class_map)
-
-    # fps = num_samples / forward_time
-
-    if config.test_mode == "FPS":
-        start_time = time.time()       
-        for i in range(config.loop_count):
-            context.execute_v2(allocations)  
-        end_time = time.time()  
-        forward_time = end_time - start_time      
-        fps = (config.loop_count*config.bsz) / forward_time
-        print("FPS : ", fps)
-        print(f"Performance Check : Test {fps} >= target {config.fps_target}")
-        if fps >= config.fps_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-    if config.test_mode == "MAP":
-        if len(json_result) == 0:
-            print("Predict zero box!")
-            exit(10)
-
-        if not os.path.exists(config.pred_dir):
-            os.makedirs(config.pred_dir)
-
-        pred_json = os.path.join(
-            config.pred_dir, f"{config.model_name}_{config.precision}_preds.json"
-        )
-        with open(pred_json, "w") as f:
-            json.dump(json_result, f)
-
-        anno_json = config.coco_gt
-        anno = COCO(anno_json)  # init annotations api
-        pred = anno.loadRes(pred_json)  # init predictions api
-        eval = COCOeval(anno, pred, "bbox")
-
-        eval.evaluate()
-        eval.accumulate()
-        print(
-            f"==============================eval {config.model_name} {config.precision} coco map =============================="
-        )
-        eval.summarize()
-
-        map, map50 = eval.stats[:2]
-        print("MAP@0.5 : ", map50)
-        print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        if map50 >= config.map_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-def parse_config():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX"
-    )
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
-    parser.add_argument(
-        "--model_engine",
-        type=str,
-        default="",
-        help="model engine path",
-    )
-    parser.add_argument(
-        "--nms_engine",
-        type=str,
-        default="",
-        help="nms engine path",
-    )
-    parser.add_argument(
-        "--coco_gt",
-        type=str,
-        default="data/datasets/cv/coco2017/annotations/instances_val2017.json",
-        help="coco instances_val2017.json",
-    )
-    parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
-    parser.add_argument("--loop_count", type=int, default=-1, help="loop count")
-    parser.add_argument(
-        "--eval_dir",
-        type=str,
-        default="data/datasets/cv/coco2017/val2017",
-        help="coco image dir",
-    )
-    parser.add_argument("--bsz", type=int, default=32, help="test batch size")
-    parser.add_argument(
-        "--imgsz",
-        "--img",
-        "--img-size",
-        type=int,
-        default=640,
-        help="inference size h,w",
-    )
-    parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--use_async", action="store_true")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs")
-    parser.add_argument("--map_target", type=float, default=0.56, help="target mAP")
-    parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
-    parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly")
-    parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU")
-
-    config = parser.parse_args()
-    print("config:", config)
-    return config
-
-if __name__ == "__main__":
-    config = parse_config()
-    main(config)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
deleted file mode 100644
index 932efbdf..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/load_ixrt_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import ctypes
-import tensorrt
-from os.path import join, dirname, exists
-def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    ctypes.CDLL(dynamic_path)
-    tensorrt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
deleted file mode 100644
index 00ed65dd..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/modify_batchsize.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import onnx
-import argparse
-
-def change_input_dim(model, bsz):
-    batch_size = bsz
-
-    # The following code changes the first dimension of every input to be batch_size
-    # Modify as appropriate ... note that this requires all inputs to
-    # have the same batch_size
-    inputs = model.graph.input
-    for input in inputs:
-        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
-        # Add checks as needed.
-        dim1 = input.type.tensor_type.shape.dim[0]
-        # update dim to be a symbolic value
-        if isinstance(batch_size, str):
-            # set dynamic batch size
-            dim1.dim_param = batch_size
-        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
-            # set given batch size
-            dim1.dim_value = int(batch_size)
-        else:
-            # set batch size of 1
-            dim1.dim_value = 1
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--batch_size", type=int)
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-model = onnx.load(args.origin_model)
-change_input_dim(model, args.batch_size)
-onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/quant.py b/models/cv/object_detection/yolov5s/ixrt/quant.py
deleted file mode 100644
index d73212ca..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/quant.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import random
-import argparse
-import numpy as np
-from tensorrt.deploy import static_quantize
-
-import torch
-import sys
-sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
-print(sys.path)
-from calibration_dataset import create_dataloaders
-
-def setseed(seed=42):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str)
-    parser.add_argument("--model", type=str,  default="yolov5s_with_decoder.onnx")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--dataset_dir", type=str,  default="./coco2017/val2017")
-    parser.add_argument("--ann_file", type=str,  default="./coco2017/annotations/instances_val2017.json")
-    parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
-    parser.add_argument("--disable_quant_names", nargs='*', type=str)
-    parser.add_argument("--save_dir", type=str,  help="save path", default=None)
-    parser.add_argument("--bsz", type=int, default=32)
-    parser.add_argument("--step", type=int, default=20)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--imgsz", type=int, default=640)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-setseed(args.seed)
-model_name = args.model_name
-
-out_dir = args.save_dir
-dataloader = create_dataloaders(
-    data_path=args.dataset_dir,
-    annFile=args.ann_file,
-    img_sz=args.imgsz,
-    batch_size=args.bsz,
-    step=args.step,
-    data_process_type=args.data_process_type
-)
-# print("disable_quant_names : ", args.disable_quant_names)
-static_quantize(args.model,
-        calibration_dataloader=dataloader,
-        save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
-        observer=args.observer,
-        data_preprocess=lambda x: x[0].to("cuda"),
-        quant_format="qdq",
-        disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/requirements.txt b/models/cv/object_detection/yolov5s/ixrt/requirements.txt
deleted file mode 100644
index b1a10ab0..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-tqdm
-onnx
-onnxsim
-ultralytics==8.3.97
-pycocotools
-pycuda
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
index b5cf3c97..52ec959f 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.56
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
index f765679f..5e2f97fb 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_fp16_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=840
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
index 9b41db7d..606fc94c 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.56
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +119,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             True                     \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_with_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +149,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_with_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
diff --git a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
index a2715061..b2983669 100644
--- a/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
+++ b/models/cv/object_detection/yolov5s/ixrt/scripts/infer_yolov5s_int8_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=840
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -80,6 +76,7 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
+
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -115,7 +112,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion_no_cancat.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -123,7 +120,6 @@ if [ $LAYER_FUSION == "1" ]; then
             --src ${CURRENT_MODEL}                          \
             --dst ${FUSION_ONNX}                            \
             --decoder_type        YoloV5Decoder             \
-            --with_nms             False                    \
             --decoder_input_names ${DECODER_INPUT_NAMES[@]} \
             --decoder8_anchor     ${DECODER_8_ANCHOR[@]}    \
             --decoder16_anchor    ${DECODER_16_ANCHOR[@]}   \
@@ -138,7 +134,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}_without_nms.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -154,7 +150,7 @@ CURRENT_MODEL=${FINAL_MODEL}
 let step++
 echo;
 echo [STEP ${step}] : Build Engine
-ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}_without_nms.engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
 if [ -f $ENGINE_FILE ];then
     echo "  "Build Engine Skip, $ENGINE_FILE has been existed
 else
@@ -181,7 +177,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}               \
+    --model_engine=${ENGINE_FILE}              \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py b/models/cv/object_detection/yolov5s/ixrt/simplify_model.py
deleted file mode 100644
index b4254b6f..00000000
--- a/models/cv/object_detection/yolov5s/ixrt/simplify_model.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-# Simplify
-def simplify_model(args):
-    onnx_model = onnx.load(args.origin_model)
-    model_simp, check = simplify(onnx_model)
-    model_simp = onnx.shape_inference.infer_shapes(model_simp)
-    onnx.save(model_simp, args.output_model)
-    print("  Simplify onnx Done.")
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-simplify_model(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/README.md b/models/cv/object_detection/yolov7/ixrt/README.md
index 7d867750..8ff917cc 100644
--- a/models/cv/object_detection/yolov7/ixrt/README.md
+++ b/models/cv/object_detection/yolov7/ixrt/README.md
@@ -30,7 +30,7 @@ yum install -y mesa-libGL
 ## Ubuntu
 apt install -y libgl1-mesa-glx
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 ```
 
 ### Model Conversion
@@ -47,13 +47,13 @@ mv yolov7.onnx /Path/to/checkpoints/yolov7m.onnx
 ## Model Inference
 
 ```bash
-export PROJ_DIR=/Path/to/yolov7/ixrt
+export PROJ_DIR=./
 export DATASETS_DIR=/Path/to/coco/
 export CHECKPOINTS_DIR=./checkpoints
 export COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
 export EVAL_DIR=${DATASETS_DIR}/images/val2017
-export RUN_DIR=/Path/to/yolov7/ixrt
-export CONFIG_DIR=config/YOLOV7_CONFIG
+export RUN_DIR=../../ixrt_common
+export CONFIG_DIR=../../ixrt_common/config/YOLOV7_CONFIG
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/yolov7/ixrt/build_engine.py b/models/cv/object_detection/yolov7/ixrt/build_engine.py
deleted file mode 100644
index d47e45e5..00000000
--- a/models/cv/object_detection/yolov7/ixrt/build_engine.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import cv2
-import argparse
-import numpy as np
-
-import torch
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(config.model)
-
-    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
-    # print("precision : ", precision)
-    build_config.set_flag(precision)
-
-    plan = builder.build_serialized_network(network, build_config)
-    engine_file_path = config.engine
-    with open(engine_file_path, "wb") as f:
-        f.write(plan)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    # engine args
-    parser.add_argument("--engine", type=str, default=None)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py b/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
deleted file mode 100644
index 25f0ab8a..00000000
--- a/models/cv/object_detection/yolov7/ixrt/build_nms_engine.py
+++ /dev/null
@@ -1,81 +0,0 @@
-import os
-import argparse
-import torch
-import onnx
-from onnx import helper
-from onnx import TensorProto, numpy_helper
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-def create_onnx(args):
-    nms = helper.make_node(
-        "DetectionNMS_IxRT",
-        name="NMS",
-        inputs=["nms_input"],
-        outputs=["nms_output0", "nms_output1"],
-        nMaxKeep=args.max_box_pre_img,
-        fIoUThresh=args.iou_thresh,
-        fScoreThresh=args.score_thresh
-    )
-    graph = helper.make_graph(
-        nodes=[nms],
-        name="gpu_nms",
-        inputs=[
-            helper.make_tensor_value_info(
-                "nms_input", onnx.TensorProto.FLOAT, (args.bsz, args.all_box_num, 6)
-            )
-        ],
-        outputs=[
-            helper.make_tensor_value_info(
-                "nms_output0", onnx.TensorProto.FLOAT, (args.bsz, args.max_box_pre_img, 6)
-            ),
-            helper.make_tensor_value_info(
-                "nms_output1", onnx.TensorProto.INT32, (args.bsz,)
-            )
-        ],
-        initializer=[]
-    )
-
-    op = onnx.OperatorSetIdProto()
-    op.version = 13
-    model = onnx.helper.make_model(graph)
-
-    model = onnx.helper.make_model(graph, opset_imports=[op])
-    onnx_path = args.path + "/nms.onnx"
-    onnx.save(model, onnx_path)
-
-def build_engine(args):
-    onnx_path = args.path + "/nms.onnx"
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(onnx_path)
-    plan = builder.build_serialized_network(network, build_config)
-
-    engine_path = args.path + "/nms.engine"
-    with open(engine_path, "wb") as f:
-        f.write(plan)
-
-def main(args):
-    create_onnx(args)
-    build_engine(args)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--bsz", type=int, default=1, help="batch size")
-    parser.add_argument("--path", type=str)
-    parser.add_argument("--all_box_num", type=int, default=25200)
-    parser.add_argument("--max_box_pre_img", type=int, default=1000)
-    parser.add_argument("--iou_thresh", type=float, default=0.6)
-    parser.add_argument("--score_thresh", type=float, default=0.001)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py b/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
deleted file mode 100644
index 578e013d..00000000
--- a/models/cv/object_detection/yolov7/ixrt/calibration_dataset.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import os
-import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-
-
-
-from datasets.coco import CocoDetection
-
-def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
-    dataset = CocoDetection(
-        root=data_path,
-        annFile=annFile,
-        img_size=img_sz,
-        data_process_type=data_process_type
-    )
-    calibration_dataset = dataset
-    num_samples = min(5000, batch_size * step)
-    if num_samples > 0:
-        calibration_dataset = torch.utils.data.Subset(
-            dataset, indices=range(num_samples)
-        )
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=False,
-        batch_size=batch_size,
-        drop_last=False,
-        num_workers=workers,
-    )
-    return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh
index 310566fb..611fcd19 100644
--- a/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov7/ixrt/ci/prepare.sh
@@ -25,7 +25,7 @@ else
     echo "Not Support Os"
 fi
 
-pip3 install -r requirements.txt
+pip3 install -r ../../ixrt_common/requirements.txt
 mkdir -p checkpoints
 cp -r /root/data/3rd_party/yolov7 ./
 cd yolov7
diff --git a/models/cv/object_detection/yolov7/ixrt/coco_labels.py b/models/cv/object_detection/yolov7/ixrt/coco_labels.py
deleted file mode 100644
index 69d38878..00000000
--- a/models/cv/object_detection/yolov7/ixrt/coco_labels.py
+++ /dev/null
@@ -1,89 +0,0 @@
-labels = [
-    "person",
-    "bicycle",
-    "car",
-    "motorcycle",
-    "airplane",
-    "bus",
-    "train",
-    "truck",
-    "boat",
-    "traffic light",
-    "fire hydrant",
-    "stop sign",
-    "parking meter",
-    "bench",
-    "bird",
-    "cat",
-    "dog",
-    "horse",
-    "sheep",
-    "cow",
-    "elephant",
-    "bear",
-    "zebra",
-    "giraffe",
-    "backpack",
-    "umbrella",
-    "handbag",
-    "tie",
-    "suitcase",
-    "frisbee",
-    "skis",
-    "snowboard",
-    "sports ball",
-    "kite",
-    "baseball bat",
-    "baseball glove",
-    "skateboard",
-    "surfboard",
-    "tennis racket",
-    "bottle",
-    "wine glass",
-    "cup",
-    "fork",
-    "knife",
-    "spoon",
-    "bowl",
-    "banana",
-    "apple",
-    "sandwich",
-    "orange",
-    "broccoli",
-    "carrot",
-    "hot dog",
-    "pizza",
-    "donut",
-    "cake",
-    "chair",
-    "couch",
-    "potted plant",
-    "bed",
-    "dining table",
-    "toilet",
-    "tv",
-    "laptop",
-    "mouse",
-    "remote",
-    "keyboard",
-    "cell phone",
-    "microwave",
-    "oven",
-    "toaster",
-    "sink",
-    "refrigerator",
-    "book",
-    "clock",
-    "vase",
-    "scissors",
-    "teddy bear",
-    "hair drier",
-    "toothbrush",
-]
-def coco80_to_coco91_class():  # converts 80-index (val2014) to 91-index (paper)
-    return [
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
-        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
-        64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
-
-__all__ = ["labels"]
diff --git a/models/cv/object_detection/yolov7/ixrt/common.py b/models/cv/object_detection/yolov7/ixrt/common.py
deleted file mode 100644
index 5f543555..00000000
--- a/models/cv/object_detection/yolov7/ixrt/common.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-
-import tensorrt
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-# input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
-# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
-def box_class85to6(input):
-    center_x_y = input[:, :2]
-    side = input[:, 2:4]
-    conf = input[:, 4:5]
-    class_id = np.argmax(input[:, 5:], axis = -1)
-    class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
-    max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
-    x1_y1 = center_x_y - 0.5 * side
-    x2_y2 = center_x_y + 0.5 * side
-    nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
-    return nms_input
-
-def save2json(batch_img_id, pred_boxes, json_result, class_trans):
-    for i, boxes in enumerate(pred_boxes):
-        if boxes is not None:
-            image_id = int(batch_img_id[i])
-            # have no target
-            if image_id == -1:
-                continue
-            for x, y, w, h, c, p in boxes:
-                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
-                c = int(c)
-                json_result.append(
-                    {
-                        "image_id": image_id,
-                        "category_id": class_trans[c - 1],
-                        "bbox": [x, y, w, h],
-                        "score": p,
-                    }
-                )
-
-def create_engine_context(engine_path, logger):
-    with open(engine_path, "rb") as f:
-        runtime = tensorrt.Runtime(logger)
-        assert runtime
-        engine = runtime.deserialize_cuda_engine(f.read())
-        assert engine
-        context = engine.create_execution_context()
-        assert context
-
-    return engine, context
-
-def get_io_bindings(engine):
-    # Setup I/O bindings
-    inputs = []
-    outputs = []
-    allocations = []
-
-    for i in range(engine.num_bindings):
-        is_input = False
-        if engine.binding_is_input(i):
-            is_input = True
-        name = engine.get_binding_name(i)
-        dtype = engine.get_binding_dtype(i)
-        shape = engine.get_binding_shape(i)
-        if is_input:
-            batch_size = shape[0]
-        size = np.dtype(tensorrt.nptype(dtype)).itemsize
-        for s in shape:
-            size *= s
-        err, allocation = cudart.cudaMalloc(size)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-        binding = {
-            "index": i,
-            "name": name,
-            "dtype": np.dtype(tensorrt.nptype(dtype)),
-            "shape": list(shape),
-            "allocation": allocation,
-            "nbytes": size,
-        }
-        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
-        allocations.append(allocation)
-        if engine.binding_is_input(i):
-            inputs.append(binding)
-        else:
-            outputs.append(binding)
-    return inputs, outputs, allocations
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/cut_model.py b/models/cv/object_detection/yolov7/ixrt/cut_model.py
deleted file mode 100644
index af0a3a4f..00000000
--- a/models/cv/object_detection/yolov7/ixrt/cut_model.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--input_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    parser.add_argument("--input_names", nargs='+', type=str)
-    parser.add_argument("--output_names", nargs='+', type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
-print("  Cut Model Done.")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py b/models/cv/object_detection/yolov7/ixrt/datasets/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py b/models/cv/object_detection/yolov7/ixrt/datasets/coco.py
deleted file mode 100644
index 7f355b84..00000000
--- a/models/cv/object_detection/yolov7/ixrt/datasets/coco.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os.path
-from typing import Any, Callable, List, Optional, Tuple
-
-import cv2
-
-from .vision import VisionDataset
-from .pre_process import get_post_process
-class CocoDetection(VisionDataset):
-    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        annFile: str,
-        img_size: int,
-        data_process_type: str,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-        transforms: Optional[Callable] = None,
-        
-    ) -> None:
-        super().__init__(root, transforms, transform, target_transform)
-        from pycocotools.coco import COCO
-
-        self.coco = COCO(annFile)
-        self.ids = list(sorted(self.coco.imgs.keys()))
-        self.img_size = img_size
-        
-        self.transforms = get_post_process(data_process_type)
-
-    def _load_image(self, id: int):
-        path = self.coco.loadImgs(id)[0]["file_name"]
-        data = cv2.imread(os.path.join(self.root, path))
-        return data
-
-    def _load_target(self, id: int) -> List[Any]:
-        return self.coco.loadAnns(self.coco.getAnnIds(id))
-
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        id = self.ids[index]
-        image = self._load_image(id)
-        target = self._load_target(id)
-        origin_shape = image.shape[:2]
-
-        if self.transforms is not None:
-            image = self.transforms(image, self.img_size)
-
-        if len(target) > 0:
-            image_id = target[0]["image_id"]
-        else:
-            # have no target
-            image_id = -1
-        return image, origin_shape, image_id
-
-    def __len__(self) -> int:
-        return len(self.ids)
-
-
-class CocoCaptions(CocoDetection):
-    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-
-    Example:
-
-        .. code:: python
-
-            import torchvision.datasets as dset
-            import torchvision.transforms as transforms
-            cap = dset.CocoCaptions(root = 'dir where images are',
-                                    annFile = 'json annotation file',
-                                    transform=transforms.PILToTensor())
-
-            print('Number of samples: ', len(cap))
-            img, target = cap[3] # load 4th sample
-
-            print("Image Size: ", img.size())
-            print(target)
-
-        Output: ::
-
-            Number of samples: 82783
-            Image Size: (3L, 427L, 640L)
-            [u'A plane emitting smoke stream flying over a mountain.',
-            u'A plane darts across a bright blue sky behind a mountain covered in snow',
-            u'A plane leaves a contrail above the snowy mountain top.',
-            u'A mountain that has a plane flying overheard in the distance.',
-            u'A mountain view with a plume of smoke in the background']
-
-    """
-
-    def _load_target(self, id: int) -> List[str]:
-        return [ann["caption"] for ann in super()._load_target(id)]
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/common.py b/models/cv/object_detection/yolov7/ixrt/datasets/common.py
deleted file mode 100644
index e120e00f..00000000
--- a/models/cv/object_detection/yolov7/ixrt/datasets/common.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
-    # Resize and pad image while meeting stride-multiple constraints
-    shape = im.shape[:2]  # current shape [height, width]
-    if isinstance(new_shape, int):
-        new_shape = (new_shape, new_shape)
-
-    # Scale ratio (new / old)
-    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-    if not scaleup:  # only scale down, do not scale up (for better val mAP)
-        r = min(r, 1.0)
-
-    # Compute padding
-    ratio = r, r  # width, height ratios
-    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
-    if auto:  # minimum rectangle
-        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
-    elif scaleFill:  # stretch
-        dw, dh = 0.0, 0.0
-        new_unpad = (new_shape[1], new_shape[0])
-        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
-
-    dw /= 2  # divide padding into 2 sides
-    dh /= 2
-
-    if shape[::-1] != new_unpad:  # resize
-        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
-    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
-    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
-    return im, ratio, (dw, dh)
-
-def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
-    # Rescale boxes (xyxy) from net_shape to ori_shape
-
-    if use_letterbox:
-
-        gain = min(
-            net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
-        )  # gain  = new / old
-        pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
-            net_shape[0] - ori_shape[0] * gain
-        ) / 2.0
-
-        boxes[:, [0, 2]] -= pad[0]  # x padding
-        boxes[:, [1, 3]] -= pad[1]  # y padding
-        boxes[:, :4] /= gain
-    else:
-        x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
-
-        boxes[:, 0] /= x_scale
-        boxes[:, 1] /= y_scale
-        boxes[:, 2] /= x_scale
-        boxes[:, 3] /= y_scale
-
-    clip_boxes(boxes, ori_shape)
-    return boxes
-
-def clip_boxes(boxes, shape):
-
-    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
-    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
deleted file mode 100644
index a58c02f8..00000000
--- a/models/cv/object_detection/yolov7/ixrt/datasets/post_process.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox, scale_boxes, clip_boxes
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Postprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Postprocess
-    elif data_process_type == "yolox":
-        return YoloxPostprocess
-    return None
-
-def Yolov3Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=False
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def Yolov5Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=True
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def YoloxPostprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i])
-            boxes[:, :4] /= r
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-            clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i]))
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py b/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
deleted file mode 100644
index 8cc643a8..00000000
--- a/models/cv/object_detection/yolov7/ixrt/datasets/pre_process.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Preprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Preprocess
-    elif data_process_type == "yolox":
-        return YoloxPreprocess
-    return None
-
-def Yolov3Preprocess(image, img_size):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    image = cv2.resize(image, (img_size, img_size))
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def Yolov5Preprocess(image, img_size, augment=False):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    if r != 1:  # if sizes are not equal
-        interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA
-        image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
-
-    # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  rect == True
-
-    image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False)
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def YoloxPreprocess(img, img_size, swap=(2,0,1)):
-
-    padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114
-    r = min(img_size / img.shape[0], img_size / img.shape[1])
-    resized_img = cv2.resize(
-        img,
-        (int(img.shape[1] * r), int(img.shape[0] * r)),
-        interpolation=cv2.INTER_LINEAR, 
-    ).astype(np.uint8)
-
-    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
-    padded_img = padded_img.transpose(swap)
-    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
-
-    return padded_img
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py b/models/cv/object_detection/yolov7/ixrt/datasets/vision.py
deleted file mode 100644
index 32da4a78..00000000
--- a/models/cv/object_detection/yolov7/ixrt/datasets/vision.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-from typing import Any, Callable, List, Optional, Tuple
-
-import torch
-import torch.utils.data as data
-
-from types import FunctionType
-
-def _log_api_usage_once(obj: Any) -> None:
-
-    """
-    Logs API usage(module and name) within an organization.
-    In a large ecosystem, it's often useful to track the PyTorch and
-    TorchVision APIs usage. This API provides the similar functionality to the
-    logging module in the Python stdlib. It can be used for debugging purpose
-    to log which methods are used and by default it is inactive, unless the user
-    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
-    Please note it is triggered only once for the same API call within a process.
-    It does not collect any data from open-source users since it is no-op by default.
-    For more information, please refer to
-    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
-    * Logging policy: https://github.com/pytorch/vision/issues/5052;
-
-    Args:
-        obj (class instance or method): an object to extract info from.
-    """
-    module = obj.__module__
-    if not module.startswith("torchvision"):
-        module = f"torchvision.internal.{module}"
-    name = obj.__class__.__name__
-    if isinstance(obj, FunctionType):
-        name = obj.__name__
-    torch._C._log_api_usage_once(f"{module}.{name}")
-
-class VisionDataset(data.Dataset):
-    """
-    Base Class For making datasets which are compatible with torchvision.
-    It is necessary to override the ``__getitem__`` and ``__len__`` method.
-
-    Args:
-        root (string): Root directory of dataset.
-        transforms (callable, optional): A function/transforms that takes in
-            an image and a label and returns the transformed versions of both.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.RandomCrop``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-
-    .. note::
-
-        :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
-    """
-
-    _repr_indent = 4
-
-    def __init__(
-        self,
-        root: str,
-        transforms: Optional[Callable] = None,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-    ) -> None:
-        _log_api_usage_once(self)
-        if isinstance(root, str):
-            root = os.path.expanduser(root)
-        self.root = root
-
-        has_transforms = transforms is not None
-        has_separate_transform = transform is not None or target_transform is not None
-        if has_transforms and has_separate_transform:
-            raise ValueError("Only transforms or transform/target_transform can be passed as argument")
-
-        # for backwards-compatibility
-        self.transform = transform
-        self.target_transform = target_transform
-
-        if has_separate_transform:
-            transforms = StandardTransform(transform, target_transform)
-        self.transforms = transforms
-
-    def __getitem__(self, index: int) -> Any:
-        """
-        Args:
-            index (int): Index
-
-        Returns:
-            (Any): Sample and meta data, optionally transformed by the respective transforms.
-        """
-        raise NotImplementedError
-
-    def __len__(self) -> int:
-        raise NotImplementedError
-
-    def __repr__(self) -> str:
-        head = "Dataset " + self.__class__.__name__
-        body = [f"Number of datapoints: {self.__len__()}"]
-        if self.root is not None:
-            body.append(f"Root location: {self.root}")
-        body += self.extra_repr().splitlines()
-        if hasattr(self, "transforms") and self.transforms is not None:
-            body += [repr(self.transforms)]
-        lines = [head] + [" " * self._repr_indent + line for line in body]
-        return "\n".join(lines)
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def extra_repr(self) -> str:
-        return ""
-
-
-class StandardTransform:
-    def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
-        self.transform = transform
-        self.target_transform = target_transform
-
-    def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
-        if self.transform is not None:
-            input = self.transform(input)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
-        return input, target
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def __repr__(self) -> str:
-        body = [self.__class__.__name__]
-        if self.transform is not None:
-            body += self._format_transform_repr(self.transform, "Transform: ")
-        if self.target_transform is not None:
-            body += self._format_transform_repr(self.target_transform, "Target transform: ")
-
-        return "\n".join(body)
diff --git a/models/cv/object_detection/yolov7/ixrt/deploy.py b/models/cv/object_detection/yolov7/ixrt/deploy.py
deleted file mode 100644
index 83f80a9e..00000000
--- a/models/cv/object_detection/yolov7/ixrt/deploy.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# !/usr/bin/env python
-# -*- coding: utf-8 -*-
-import argparse
-from tensorrt.deploy.api import GraphTransform, create_source, create_target
-
-class Transform:
-    def __init__(self, graph):
-        self.t = GraphTransform(graph)
-        self.graph = graph
-
-    def ReplaceFocus(self, input_edge, outputs, to_op):
-        input_var = self.graph.get_variable(input_edge)
-        op = self.graph.get_operator(to_op)
-        self.t.delete_operators_between_var_op(
-            from_var=input_var, to_op=op
-        )
-        self.t.make_operator(
-            "Focus", inputs=input_edge, outputs=outputs
-        )
-        return self.graph
-
-    def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes):
-        if attributes["anchor"] is None:
-            del attributes["anchor"]
-        self.t.make_operator(
-            op_type, inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-    def AddConcatOp(self, inputs: list, outputs, **attributes):
-        self.t.make_operator(
-            "Concat", inputs=inputs, outputs=outputs, **attributes
-        )
-        return self.graph
-
-def customize_ops(graph, args):
-    t = Transform(graph)
-    fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None
-    if fuse_focus:
-        graph = t.ReplaceFocus(
-            input_edge=args.focus_input,
-            outputs=args.focus_output,
-            to_op=args.focus_last_node
-        )
-    decoder_input = args.decoder_input_names
-    num = len(decoder_input) // 3
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[:num],
-        outputs=["decoder_8"],
-        op_type=args.decoder_type,
-        anchor=args.decoder8_anchor,
-        num_class=args.num_class,
-        stride=8,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num:num*2],
-        outputs=["decoder_16"],
-        op_type=args.decoder_type,
-        anchor=args.decoder16_anchor,
-        num_class=args.num_class,
-        stride=16,
-        faster_impl=args.faster
-    )
-    graph = t.AddYoloDecoderOp(
-        inputs=decoder_input[num*2:num*2+1],
-        outputs=["decoder_32"],
-        op_type=args.decoder_type,
-        anchor=args.decoder32_anchor,
-        num_class=args.num_class,
-        stride=32,
-        faster_impl=args.faster
-    )
-    if args.decoder64_anchor is not None:
-        graph = t.AddYoloDecoderOp(
-            inputs=decoder_input[num*2+1:],
-            outputs=["decoder_64"],
-            op_type=args.decoder_type,
-            anchor=args.decoder64_anchor,
-            num_class=args.num_class,
-            stride=64,
-            faster_impl=args.faster
-        )
-        graph = t.AddConcatOp(
-            inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"],
-            outputs=["output"],
-            axis=1
-        )
-    else:
-        graph = t.AddConcatOp(
-            inputs=["decoder_32", "decoder_16", "decoder_8"],
-            outputs=["output"],
-            axis=1
-        )
-
-    graph.outputs.clear()
-    graph.add_output("output")
-    graph.outputs["output"].dtype = "FLOAT"
-    return graph
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--src", type=str)
-    parser.add_argument("--dst", type=str)
-    parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
-    parser.add_argument("--decoder_input_names", nargs='+', type=str)
-    parser.add_argument("--decoder8_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder16_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder32_anchor", nargs='*', type=int)
-    parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
-    parser.add_argument("--num_class", type=int, default=80)
-    parser.add_argument("--faster", type=int, default=1)
-    parser.add_argument("--focus_input", type=str, default=None)
-    parser.add_argument("--focus_output", type=str, default=None)
-    parser.add_argument("--focus_last_node", type=str, default=None)
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-
-    args = parse_args()
-    graph = create_source(args.src)()
-    graph = customize_ops(graph, args)
-    create_target(saved_path=args.dst).export(graph)
-    print("Surged onnx lies on", args.dst)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/inference.py b/models/cv/object_detection/yolov7/ixrt/inference.py
deleted file mode 100644
index 5637b839..00000000
--- a/models/cv/object_detection/yolov7/ixrt/inference.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import argparse
-import glob
-import json
-import os
-import time
-import sys
-
-import torch
-import numpy as np
-import cuda.cuda as cuda
-import cuda.cudart as cudart
-
-from coco_labels import coco80_to_coco91_class, labels
-from common import save2json, box_class85to6
-from common import create_engine_context, get_io_bindings
-from calibration_dataset import create_dataloaders
-from datasets.post_process import get_post_process
-
-from pycocotools.coco import COCO
-from pycocotools.cocoeval import COCOeval
-from tqdm import tqdm
-from tqdm.contrib import tzip
-
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-
-    # Load dataloader
-    dataloader = create_dataloaders(
-        data_path=config.eval_dir,
-        annFile=config.coco_gt,
-        img_sz=config.imgsz,
-        batch_size=config.bsz,
-        step=config.loop_count,
-        data_process_type=config.data_process_type
-    )
-
-    # Load post process func
-    if config.test_mode == "MAP":
-        post_process_func = get_post_process(config.data_process_type)
-
-    bsz = config.bsz
-    num_samples = 5000
-    if config.loop_count > 0:
-        num_samples = bsz * config.loop_count
-    num_batch = len(dataloader)
-    print("=" * 30)
-    print(f"Test Mode : {'Asynchronous' if config.use_async else 'Synchronous'}")
-    print(f"Total sample : {num_samples}\nBatch_size : {bsz}\nRun Batch : {num_batch}")
-    print("=" * 30)
-
-    json_result = []
-    forward_time = 0.0
-    class_map = coco80_to_coco91_class()
-
-    host_mem = tensorrt.IHostMemory
-    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
-
-    # Load Engine
-    engine, context = create_engine_context(config.model_engine, logger)
-    inputs, outputs, allocations = get_io_bindings(engine)
-
-    # Load nms_engine
-    if config.test_mode == "MAP" and config.nms_type == "GPU":
-        nms_engine, nms_context = create_engine_context(config.nms_engine, logger)
-        nms_inputs, nms_outputs, nms_allocations = get_io_bindings(nms_engine)
-        nms_output0 = np.zeros(nms_outputs[0]["shape"], nms_outputs[0]["dtype"])
-        nms_output1 = np.zeros(nms_outputs[1]["shape"], nms_outputs[1]["dtype"])
-        print(f"nms_output0 shape : {nms_output0.shape}   nms_output0 type : {nms_output0.dtype}")
-        print(f"nms_output1 shape : {nms_output1.shape}   nms_output1 type : {nms_output1.dtype}")
-
-    # Warm up
-    if config.warm_up > 0:
-        print("\nWarm Start.")
-        for i in range(config.warm_up):
-            context.execute_v2(allocations)
-        print("Warm Done.")
-
-    # Prepare the output data
-    output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
-    print(f"output shape : {output.shape} output type : {output.dtype}")
-
-    for batch_data, batch_img_shape, batch_img_id in tqdm(dataloader):
-        batch_data = batch_data.numpy()
-        batch_img_shape = [batch_img_shape[0].numpy(), batch_img_shape[1].numpy()]
-        
-        # batch_img_id = batch_img_id.numpy()
-
-        cur_bsz_sample = batch_data.shape[0]
-      
-
-        # Set input
-        err, = cuda.cuMemcpyHtoD(inputs[0]["allocation"], batch_data, batch_data.nbytes)
-        assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-        # Forward
-        
-        context.execute_v2(allocations)
-        
-        
-
-        if config.test_mode == "MAP":
-            # Fetch output
-            err, = cuda.cuMemcpyDtoH(output, outputs[0]["allocation"], outputs[0]["nbytes"])
-            assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 1 : prepare data to nms
-            _, box_num, box_unit = output.shape
-            if config.debug:
-                print(f"[Debug] box_num(25200) : {box_num}, box_unit(6) : {box_unit}")
-
-            if config.decoder_faster == 0:
-                nms_input = box_class85to6(output.reshape(-1, box_unit))
-            else:
-                nms_input = output
-
-            # Step 2 : nms
-            # cpu nms(TODO)
-
-            # gpu nms
-            if config.nms_type == "GPU":
-
-                # Set nms input
-                err, = cuda.cuMemcpyHtoD(nms_inputs[0]["allocation"], nms_input, nms_input.nbytes)
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                nms_context.execute_v2(nms_allocations)
-                err, = cuda.cuMemcpyDtoH(nms_output0, nms_outputs[0]["allocation"], nms_outputs[0]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-                err, = cuda.cuMemcpyDtoH(nms_output1, nms_outputs[1]["allocation"], nms_outputs[1]["nbytes"])
-                assert(err == cuda.CUresult.CUDA_SUCCESS)
-
-            # Step 3 : post process + save
-            pred_boxes = post_process_func(
-                ori_img_shape=batch_img_shape,
-                imgsz=(config.imgsz, config.imgsz),
-                box_datas=nms_output0,
-                box_nums=nms_output1,
-                sample_num=cur_bsz_sample,
-                max_det=config.max_det
-            )
-            save2json(batch_img_id, pred_boxes, json_result, class_map)
-            
-            
-    if config.test_mode == "FPS":
-        start_time = time.time()       
-        for i in range(config.loop_count):
-            context.execute_v2(allocations)  
-        end_time = time.time()  
-        forward_time = end_time - start_time      
-        fps = (config.loop_count*config.bsz) / forward_time
-        print("FPS : ", fps)
-        print(f"Performance Check : Test {fps} >= target {config.fps_target}")
-        if fps >= config.fps_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-    if config.test_mode == "MAP":
-        if len(json_result) == 0:
-            print("Predict zero box!")
-            exit(10)
-
-        if not os.path.exists(config.pred_dir):
-            os.makedirs(config.pred_dir)
-
-        pred_json = os.path.join(
-            config.pred_dir, f"{config.model_name}_{config.precision}_preds.json"
-        )
-        with open(pred_json, "w") as f:
-            json.dump(json_result, f)
-
-        anno_json = config.coco_gt
-        anno = COCO(anno_json)  # init annotations api
-        pred = anno.loadRes(pred_json)  # init predictions api
-        eval = COCOeval(anno, pred, "bbox")
-
-        eval.evaluate()
-        eval.accumulate()
-        print(
-            f"==============================eval {config.model_name} {config.precision} coco map =============================="
-        )
-        eval.summarize()
-
-        map, map50 = eval.stats[:2]
-        print("MAP@0.5 : ", map50)
-        print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
-        if map50 >= config.map_target:
-            print("pass!")
-            exit()
-        else:
-            print("failed!")
-            exit(10)
-
-def parse_config():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--model_name", type=str, default="YOLOV5s", help="YOLOV3 YOLOV5 YOLOV7 YOLOX"
-    )
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
-    parser.add_argument(
-        "--model_engine",
-        type=str,
-        default="",
-        help="model engine path",
-    )
-    parser.add_argument(
-        "--nms_engine",
-        type=str,
-        default="",
-        help="nms engine path",
-    )
-    parser.add_argument(
-        "--coco_gt",
-        type=str,
-        default="data/datasets/cv/coco2017/annotations/instances_val2017.json",
-        help="coco instances_val2017.json",
-    )
-    parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
-    parser.add_argument("--loop_count", type=int, default=-1, help="loop count")
-    parser.add_argument(
-        "--eval_dir",
-        type=str,
-        default="data/datasets/cv/coco2017/val2017",
-        help="coco image dir",
-    )
-    parser.add_argument("--bsz", type=int, default=32, help="test batch size")
-    parser.add_argument(
-        "--imgsz",
-        "--img",
-        "--img-size",
-        type=int,
-        default=640,
-        help="inference size h,w",
-    )
-    parser.add_argument("--max_det", type=int, default=1000, help="maximum detections per image")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--use_async", action="store_true")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs")
-    parser.add_argument("--map_target", type=float, default=0.56, help="target mAP")
-    parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
-    parser.add_argument("--decoder_faster", type=int, default=0, help="decoder faster can use gpu nms directly")
-    parser.add_argument("--nms_type", type=str, default="GPU", help="GPU/CPU")
-
-    config = parser.parse_args()
-    print("config:", config)
-    return config
-
-if __name__ == "__main__":
-    config = parse_config()
-    main(config)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
deleted file mode 100644
index 932efbdf..00000000
--- a/models/cv/object_detection/yolov7/ixrt/load_ixrt_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import ctypes
-import tensorrt
-from os.path import join, dirname, exists
-def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    ctypes.CDLL(dynamic_path)
-    tensorrt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py b/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
deleted file mode 100644
index 00ed65dd..00000000
--- a/models/cv/object_detection/yolov7/ixrt/modify_batchsize.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import onnx
-import argparse
-
-def change_input_dim(model, bsz):
-    batch_size = bsz
-
-    # The following code changes the first dimension of every input to be batch_size
-    # Modify as appropriate ... note that this requires all inputs to
-    # have the same batch_size
-    inputs = model.graph.input
-    for input in inputs:
-        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
-        # Add checks as needed.
-        dim1 = input.type.tensor_type.shape.dim[0]
-        # update dim to be a symbolic value
-        if isinstance(batch_size, str):
-            # set dynamic batch size
-            dim1.dim_param = batch_size
-        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
-            # set given batch size
-            dim1.dim_value = int(batch_size)
-        else:
-            # set batch size of 1
-            dim1.dim_value = 1
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--batch_size", type=int)
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-model = onnx.load(args.origin_model)
-change_input_dim(model, args.batch_size)
-onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/quant.py b/models/cv/object_detection/yolov7/ixrt/quant.py
deleted file mode 100644
index d73212ca..00000000
--- a/models/cv/object_detection/yolov7/ixrt/quant.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-import random
-import argparse
-import numpy as np
-from tensorrt.deploy import static_quantize
-
-import torch
-import sys
-sys.path.append("/home/haoyuan.chen/temp/inferencesamples/benchmarks/cv/detection/yolov3/tensorrt")
-print(sys.path)
-from calibration_dataset import create_dataloaders
-
-def setseed(seed=42):
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_name", type=str)
-    parser.add_argument("--model", type=str,  default="yolov5s_with_decoder.onnx")
-    parser.add_argument("--data_process_type", type=str,  default="none")
-    parser.add_argument("--dataset_dir", type=str,  default="./coco2017/val2017")
-    parser.add_argument("--ann_file", type=str,  default="./coco2017/annotations/instances_val2017.json")
-    parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
-    parser.add_argument("--disable_quant_names", nargs='*', type=str)
-    parser.add_argument("--save_dir", type=str,  help="save path", default=None)
-    parser.add_argument("--bsz", type=int, default=32)
-    parser.add_argument("--step", type=int, default=20)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--imgsz", type=int, default=640)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-setseed(args.seed)
-model_name = args.model_name
-
-out_dir = args.save_dir
-dataloader = create_dataloaders(
-    data_path=args.dataset_dir,
-    annFile=args.ann_file,
-    img_sz=args.imgsz,
-    batch_size=args.bsz,
-    step=args.step,
-    data_process_type=args.data_process_type
-)
-# print("disable_quant_names : ", args.disable_quant_names)
-static_quantize(args.model,
-        calibration_dataloader=dataloader,
-        save_quant_onnx_path=os.path.join(out_dir, f"quantized_{model_name}.onnx"),
-        observer=args.observer,
-        data_preprocess=lambda x: x[0].to("cuda"),
-        quant_format="qdq",
-        disable_quant_names=args.disable_quant_names)
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/requirements.txt b/models/cv/object_detection/yolov7/ixrt/requirements.txt
deleted file mode 100644
index 10a9fba6..00000000
--- a/models/cv/object_detection/yolov7/ixrt/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-tqdm
-onnx
-onnxsim
-ultralytics==8.3.97
-pycocotools
-opencv-python==4.6.0.66
-pycuda
\ No newline at end of file
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
index 30132700..52ec959f 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.68
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=float16
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -115,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -137,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
index aca4b01c..5e2f97fb 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_fp16_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=425
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=float16
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
@@ -80,7 +76,6 @@ else
 fi
 CURRENT_MODEL=${NO_DECODER_MODEL}
 
-
 # Quant Model
 if [ $PRECISION == "int8" ];then
     let step++
@@ -116,7 +111,7 @@ if [ $LAYER_FUSION == "1" ]; then
     let step++
     echo;
     echo [STEP ${step}] : Add Decoder
-    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_fusion.onnx
+    FUSION_ONNX=${CHECKPOINTS_DIR}/${MODEL_NAME}_fusion.onnx
     if [ -f $FUSION_ONNX ];then
         echo "  "Add Decoder Skip, $FUSION_ONNX has been existed
     else
@@ -138,7 +133,7 @@ fi
 let step++
 echo;
 echo [STEP ${step}] : Change Batchsize
-FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_bs${BSZ}.onnx
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_bs${BSZ}.onnx
 if [ -f $FINAL_MODEL ];then
     echo "  "Change Batchsize Skip, $FINAL_MODEL has been existed
 else
@@ -181,7 +176,7 @@ let step++
 echo;
 echo [STEP ${step}] : Inference
 python3 ${RUN_DIR}/inference.py                 \
-    --model_engine=${ENGINE_FILE}              \
+    --model_engine=${ENGINE_FILE}               \
     --nms_engine=${NMS_ENGINE}                  \
     --coco_gt=${COCO_GT}                        \
     --eval_dir=${EVAL_DIR}                      \
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
index d9cbd209..606fc94c 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_accuracy.sh
@@ -3,16 +3,15 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=-1
-TGT=0.68
+TGT=-1
 LOOP_COUNT=-1
 RUN_MODE=MAP
 PRECISION=int8
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
diff --git a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
index 051473b2..b2983669 100644
--- a/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
+++ b/models/cv/object_detection/yolov7/ixrt/scripts/infer_yolov7_int8_performance.sh
@@ -3,17 +3,16 @@
 EXIT_STATUS=0
 check_status()
 {
-    ret_code=${PIPESTATUS[0]}
-    if [ ${ret_code} != 0 ]; then
-    [[ ${ret_code} -eq 10 && "${TEST_PERF:-1}" -eq 0 ]] || EXIT_STATUS=1
+    if ((${PIPESTATUS[0]} != 0));then
+    EXIT_STATUS=1
     fi
 }
 
 # Run paraments
 BSZ=32
 WARM_UP=3
-TGT=425
-LOOP_COUNT=100
+TGT=-1
+LOOP_COUNT=10
 RUN_MODE=FPS
 PRECISION=int8
 
@@ -41,9 +40,6 @@ echo ====================== Model Info ======================
 echo Model Name : ${MODEL_NAME}
 echo Onnx Path : ${ORIGINE_MODEL}
 
-CHECKPOINTS_DIR=${CHECKPOINTS_DIR}/tmp
-mkdir -p ${CHECKPOINTS_DIR}
-
 step=0
 faster=0
 CURRENT_MODEL=${ORIGINE_MODEL}
diff --git a/models/cv/object_detection/yolov7/ixrt/simplify_model.py b/models/cv/object_detection/yolov7/ixrt/simplify_model.py
deleted file mode 100644
index b4254b6f..00000000
--- a/models/cv/object_detection/yolov7/ixrt/simplify_model.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-# Simplify
-def simplify_model(args):
-    onnx_model = onnx.load(args.origin_model)
-    model_simp, check = simplify(onnx_model)
-    model_simp = onnx.shape_inference.infer_shapes(model_simp)
-    onnx.save(model_simp, args.output_model)
-    print("  Simplify onnx Done.")
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-simplify_model(args)
\ No newline at end of file
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index 796ef109..1ab1995b 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -266,22 +266,34 @@ def run_detec_testcase(model):
         logging.info(f"Start running {model_name} {prec} test case")
         result["result"].setdefault(prec, {})
         result["result"].setdefault(prec, {"status": "FAIL"})
-        script = f"""
-        cd ../{model['model_path']}
-        export DATASETS_DIR=./{dataset_n}/
-
-        export MODEL_PATH=./{model_name}.onnx
-
-        export PROJ_DIR=./
-        export CHECKPOINTS_DIR=./checkpoints
-        export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
-        export EVAL_DIR=./{dataset_n}/images/val2017
-        export RUN_DIR=./
-        export CONFIG_DIR=config/{config_name}_CONFIG
-
-        bash scripts/infer_{model_name}_{prec}_accuracy.sh
-        bash scripts/infer_{model_name}_{prec}_performance.sh
-        """
+        if model_name in ["yolov3", "yolov5", "yolov5s", "yolov7"]:
+            script = f"""
+            cd ../{model['model_path']}
+            export DATASETS_DIR=./{dataset_n}/
+            export MODEL_PATH=./{model_name}.onnx
+            export PROJ_DIR=./
+            export CHECKPOINTS_DIR=./checkpoints
+            export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
+            export EVAL_DIR=./{dataset_n}/images/val2017
+            export RUN_DIR=../../ixrt_common
+            export CONFIG_DIR=../../ixrt_common/config/{config_name}_CONFIG
+            bash scripts/infer_{model_name}_{prec}_accuracy.sh
+            bash scripts/infer_{model_name}_{prec}_performance.sh
+            """
+        else:
+            script = f"""
+            cd ../{model['model_path']}
+            export DATASETS_DIR=./{dataset_n}/
+            export MODEL_PATH=./{model_name}.onnx
+            export PROJ_DIR=./
+            export CHECKPOINTS_DIR=./checkpoints
+            export COCO_GT=./{dataset_n}/annotations/instances_val2017.json
+            export EVAL_DIR=./{dataset_n}/images/val2017
+            export RUN_DIR=./
+            export CONFIG_DIR=config/{config_name}_CONFIG
+            bash scripts/infer_{model_name}_{prec}_accuracy.sh
+            bash scripts/infer_{model_name}_{prec}_performance.sh
+            """
 
         if model_name == "rtmpose":
             script = f"""
-- 
Gitee


From aabf62faf8df36bba798ccb156ff169e0e057350 Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 16:54:03 +0800
Subject: [PATCH 14/15] update fcos

---
 .../cv/object_detection/fcos/ixrt/README.md   |   2 +-
 .../fcos/ixrt/build_engine.py                 |  43 -----
 .../fcos/ixrt/calibration_dataset.py          |  28 ----
 .../cv/object_detection/fcos/ixrt/common.py   | 102 ------------
 .../fcos/ixrt/datasets/__init__.py            |   0
 .../fcos/ixrt/datasets/coco.py                | 116 -------------
 .../fcos/ixrt/datasets/common.py              |  68 --------
 .../fcos/ixrt/datasets/post_process.py        | 157 ------------------
 .../fcos/ixrt/datasets/pre_process.py         |  76 ---------
 .../fcos/ixrt/datasets/vision.py              | 136 ---------------
 .../fcos/ixrt/load_ixrt_plugin.py             |  12 --
 .../fcos/ixrt/modify_batchsize.py             |  37 -----
 .../fcos/ixrt/simplify_model.py               |  21 ---
 .../fcos_r50_caffe_fpn_gn-head_1x_coco.py     |   0
 .../ixrt => ixrt_common}/inference_mmdet.py   |   0
 .../yolov3/ixrt/ci/prepare.sh                 |   4 +-
 16 files changed, 3 insertions(+), 799 deletions(-)
 delete mode 100755 models/cv/object_detection/fcos/ixrt/build_engine.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/calibration_dataset.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/common.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/__init__.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/coco.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/common.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/post_process.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/datasets/vision.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/modify_batchsize.py
 delete mode 100644 models/cv/object_detection/fcos/ixrt/simplify_model.py
 rename models/cv/object_detection/{fcos/ixrt => ixrt_common}/fcos_r50_caffe_fpn_gn-head_1x_coco.py (100%)
 rename models/cv/object_detection/{fcos/ixrt => ixrt_common}/inference_mmdet.py (100%)

diff --git a/models/cv/object_detection/fcos/ixrt/README.md b/models/cv/object_detection/fcos/ixrt/README.md
index dcc15b7c..721fed15 100755
--- a/models/cv/object_detection/fcos/ixrt/README.md
+++ b/models/cv/object_detection/fcos/ixrt/README.md
@@ -50,7 +50,7 @@ wget http://files.deepspark.org.cn:880/deepspark/fcos_opt.onnx
 export PROJ_DIR=./
 export DATASETS_DIR=./coco/
 export CHECKPOINTS_DIR=./checkpoints
-export RUN_DIR=./
+export RUN_DIR=../../ixrt_common
 ```
 
 ### FP16
diff --git a/models/cv/object_detection/fcos/ixrt/build_engine.py b/models/cv/object_detection/fcos/ixrt/build_engine.py
deleted file mode 100755
index d47e45e5..00000000
--- a/models/cv/object_detection/fcos/ixrt/build_engine.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import cv2
-import argparse
-import numpy as np
-
-import torch
-import tensorrt
-
-from load_ixrt_plugin import load_ixrt_plugin
-load_ixrt_plugin()
-
-def main(config):
-    IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
-    builder = tensorrt.Builder(IXRT_LOGGER)
-    EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
-    network = builder.create_network(EXPLICIT_BATCH)
-    build_config = builder.create_builder_config()
-    parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
-    parser.parse_from_file(config.model)
-
-    precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
-    # print("precision : ", precision)
-    build_config.set_flag(precision)
-
-    plan = builder.build_serialized_network(network, build_config)
-    engine_file_path = config.engine
-    with open(engine_file_path, "wb") as f:
-        f.write(plan)
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
-            help="The precision of datatype")
-    # engine args
-    parser.add_argument("--engine", type=str, default=None)
-
-    args = parser.parse_args()
-    return args
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py b/models/cv/object_detection/fcos/ixrt/calibration_dataset.py
deleted file mode 100644
index 2473f7d0..00000000
--- a/models/cv/object_detection/fcos/ixrt/calibration_dataset.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-import torch
-import torchvision.datasets
-from torch.utils.data import DataLoader
-from datasets.coco import CocoDetection
-
-def create_dataloaders(data_path, annFile, img_sz=640, batch_size=32, step=32, workers=2, data_process_type="yolov5"):
-    dataset = CocoDetection(
-        root=data_path,
-        annFile=annFile,
-        img_size=img_sz,
-        data_process_type=data_process_type
-    )
-    calibration_dataset = dataset
-    num_samples = min(5000, batch_size * step)
-    if num_samples > 0:
-        calibration_dataset = torch.utils.data.Subset(
-            dataset, indices=range(num_samples)
-        )
-
-    calibration_dataloader = DataLoader(
-        calibration_dataset,
-        shuffle=False,
-        batch_size=batch_size,
-        drop_last=False,
-        num_workers=workers,
-    )
-    return calibration_dataloader
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/common.py b/models/cv/object_detection/fcos/ixrt/common.py
deleted file mode 100644
index 7d9a078e..00000000
--- a/models/cv/object_detection/fcos/ixrt/common.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import numpy as np
-from tqdm import tqdm
-
-import tensorrt
-from cuda import cuda, cudart
-
-# input  : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
-# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
-def box_class85to6(input):
-    center_x_y = input[:, :2]
-    side = input[:, 2:4]
-    conf = input[:, 4:5]
-    class_id = np.argmax(input[:, 5:], axis = -1)
-    class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
-    max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
-    x1_y1 = center_x_y - 0.5 * side
-    x2_y2 = center_x_y + 0.5 * side
-    nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
-    return nms_input
-
-def save2json(batch_img_id, pred_boxes, json_result, class_trans):
-    for i, boxes in enumerate(pred_boxes):
-        if boxes is not None:
-            image_id = int(batch_img_id[i])
-            # have no target
-            if image_id == -1:
-                continue
-            for x, y, w, h, c, p in boxes:
-                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
-                c = int(c)
-                json_result.append(
-                    {
-                        "image_id": image_id,
-                        "category_id": class_trans[c - 1],
-                        "bbox": [x, y, w, h],
-                        "score": p,
-                    }
-                )
-def save2json_nonms(batch_img_id, pred_boxes, json_result):
-    for i, boxes in enumerate(pred_boxes):
-        image_id = int(batch_img_id)
-        if boxes is not None:
-            x, y, w, h, c, p = boxes
-            if image_id!=-1:
-                
-                x, y, w, h, p = float(x), float(y), float(w), float(h), float(p)
-                c = int(c)
-                json_result.append(
-                    {
-                    "image_id": image_id,
-                    "category_id": c,
-                    "bbox": [x, y, w, h],
-                    "score": p,
-                    }
-                    )
-
-def create_engine_context(engine_path, logger):
-    with open(engine_path, "rb") as f:
-        runtime = tensorrt.Runtime(logger)
-        assert runtime
-        engine = runtime.deserialize_cuda_engine(f.read())
-        assert engine
-        context = engine.create_execution_context()
-        assert context
-
-    return engine, context
-
-def get_io_bindings(engine):
-    # Setup I/O bindings
-    inputs = []
-    outputs = []
-    allocations = []
-
-    for i in range(engine.num_bindings):
-        is_input = False
-        if engine.binding_is_input(i):
-            is_input = True
-        name = engine.get_binding_name(i)
-        dtype = engine.get_binding_dtype(i)
-        shape = engine.get_binding_shape(i)
-        if is_input:
-            batch_size = shape[0]
-        size = np.dtype(tensorrt.nptype(dtype)).itemsize
-        for s in shape:
-            size *= s
-        err, allocation = cudart.cudaMalloc(size)
-        assert err == cudart.cudaError_t.cudaSuccess
-        binding = {
-            "index": i,
-            "name": name,
-            "dtype": np.dtype(tensorrt.nptype(dtype)),
-            "shape": list(shape),
-            "allocation": allocation,
-            "nbytes": size,
-        }
-        print(f"binding {i}, name : {name}  dtype : {np.dtype(tensorrt.nptype(dtype))}  shape : {list(shape)}")
-        allocations.append(allocation)
-        if engine.binding_is_input(i):
-            inputs.append(binding)
-        else:
-            outputs.append(binding)
-    return inputs, outputs, allocations
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/__init__.py b/models/cv/object_detection/fcos/ixrt/datasets/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/coco.py b/models/cv/object_detection/fcos/ixrt/datasets/coco.py
deleted file mode 100644
index 7f355b84..00000000
--- a/models/cv/object_detection/fcos/ixrt/datasets/coco.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import os.path
-from typing import Any, Callable, List, Optional, Tuple
-
-import cv2
-
-from .vision import VisionDataset
-from .pre_process import get_post_process
-class CocoDetection(VisionDataset):
-    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-    """
-
-    def __init__(
-        self,
-        root: str,
-        annFile: str,
-        img_size: int,
-        data_process_type: str,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-        transforms: Optional[Callable] = None,
-        
-    ) -> None:
-        super().__init__(root, transforms, transform, target_transform)
-        from pycocotools.coco import COCO
-
-        self.coco = COCO(annFile)
-        self.ids = list(sorted(self.coco.imgs.keys()))
-        self.img_size = img_size
-        
-        self.transforms = get_post_process(data_process_type)
-
-    def _load_image(self, id: int):
-        path = self.coco.loadImgs(id)[0]["file_name"]
-        data = cv2.imread(os.path.join(self.root, path))
-        return data
-
-    def _load_target(self, id: int) -> List[Any]:
-        return self.coco.loadAnns(self.coco.getAnnIds(id))
-
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        id = self.ids[index]
-        image = self._load_image(id)
-        target = self._load_target(id)
-        origin_shape = image.shape[:2]
-
-        if self.transforms is not None:
-            image = self.transforms(image, self.img_size)
-
-        if len(target) > 0:
-            image_id = target[0]["image_id"]
-        else:
-            # have no target
-            image_id = -1
-        return image, origin_shape, image_id
-
-    def __len__(self) -> int:
-        return len(self.ids)
-
-
-class CocoCaptions(CocoDetection):
-    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.
-
-    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.
-
-    Args:
-        root (string): Root directory where images are downloaded to.
-        annFile (string): Path to json annotation file.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.PILToTensor``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-        transforms (callable, optional): A function/transform that takes input sample and its target as entry
-            and returns a transformed version.
-
-    Example:
-
-        .. code:: python
-
-            import torchvision.datasets as dset
-            import torchvision.transforms as transforms
-            cap = dset.CocoCaptions(root = 'dir where images are',
-                                    annFile = 'json annotation file',
-                                    transform=transforms.PILToTensor())
-
-            print('Number of samples: ', len(cap))
-            img, target = cap[3] # load 4th sample
-
-            print("Image Size: ", img.size())
-            print(target)
-
-        Output: ::
-
-            Number of samples: 82783
-            Image Size: (3L, 427L, 640L)
-            [u'A plane emitting smoke stream flying over a mountain.',
-            u'A plane darts across a bright blue sky behind a mountain covered in snow',
-            u'A plane leaves a contrail above the snowy mountain top.',
-            u'A mountain that has a plane flying overheard in the distance.',
-            u'A mountain view with a plume of smoke in the background']
-
-    """
-
-    def _load_target(self, id: int) -> List[str]:
-        return [ann["caption"] for ann in super()._load_target(id)]
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/common.py b/models/cv/object_detection/fcos/ixrt/datasets/common.py
deleted file mode 100644
index a8e5e6e7..00000000
--- a/models/cv/object_detection/fcos/ixrt/datasets/common.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
-    # Resize and pad image while meeting stride-multiple constraints
-    shape = im.shape[:2]  # current shape [height, width]
-    if isinstance(new_shape, int):
-        new_shape = (new_shape, new_shape)
-
-    # Scale ratio (new / old)
-    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-    if not scaleup:  # only scale down, do not scale up (for better val mAP)
-        r = min(r, 1.0)
-
-    # Compute padding
-    ratio = r, r  # width, height ratios
-    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
-    if auto:  # minimum rectangle
-        dw, dh = np.mod(dw, stride), np.mod(dh, stride)  # wh padding
-    elif scaleFill:  # stretch
-        dw, dh = 0.0, 0.0
-        new_unpad = (new_shape[1], new_shape[0])
-        ratio = new_shape[1] / shape[1], new_shape[0] / shape[0]  # width, height ratios
-
-    dw /= 2  # divide padding into 2 sides
-    dh /= 2
-
-    if shape[::-1] != new_unpad:  # resize
-        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
-    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
-    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
-    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
-    return im, ratio, (dw, dh)
-
-def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
-    # Rescale boxes (xyxy) from net_shape to ori_shape
-
-    if use_letterbox:
-
-        gain = min(
-            net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
-        )  # gain  = new / old
-        pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
-            net_shape[0] - ori_shape[0] * gain
-        ) / 2.0
-
-        boxes[:, [0, 2]] -= pad[0]  # x padding
-        boxes[:, [1, 3]] -= pad[1]  # y padding
-        boxes[:, :4] /= gain
-    else:
-        x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
-
-        boxes[:, 0] /= x_scale
-        boxes[:, 1] /= y_scale
-        boxes[:, 2] /= x_scale
-        boxes[:, 3] /= y_scale
-
-    clip_boxes(boxes, ori_shape)
-    return boxes
-
-def clip_boxes(boxes, shape):
-
-    boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1])  # x1, x2
-    boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0])  # y1, y2
-    
-    return boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/post_process.py b/models/cv/object_detection/fcos/ixrt/datasets/post_process.py
deleted file mode 100644
index 7b411a50..00000000
--- a/models/cv/object_detection/fcos/ixrt/datasets/post_process.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import cv2
-import math
-import numpy as np
-import torch
-import torch.nn.functional as F
-
-from .common import letterbox, scale_boxes, clip_boxes
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Postprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Postprocess
-    elif data_process_type == "yolox":
-        return YoloxPostprocess
-    elif data_process_type == "detr":
-        return DetrPostprocess
-    return None
-
-def Yolov3Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=False
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def Yolov5Postprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            cur_box = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            boxes = scale_boxes(
-                (imgsz[0], imgsz[1]),
-                cur_box,
-                (ori_img_shape[0][i], ori_img_shape[1][i]),
-                use_letterbox=True
-            )
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def YoloxPostprocess(
-    ori_img_shape,
-    imgsz,
-    box_datas,
-    box_nums,
-    sample_num,
-    max_det=1000,
-):
-    all_box = []
-    data_offset = 0
-    box_datas = box_datas.flatten()
-    box_nums = box_nums.flatten()
-
-    for i in range(sample_num):
-        box_num = box_nums[i]
-        if box_num == 0:
-            boxes = None
-        else:
-            boxes = box_datas[data_offset : data_offset + box_num * 6].reshape(-1, 6)
-            r = min(imgsz[0]/ori_img_shape[0][i], imgsz[1]/ori_img_shape[1][i])
-            boxes[:, :4] /= r
-            # xyxy2xywh
-            boxes[:, 2] -= boxes[:, 0]
-            boxes[:, 3] -= boxes[:, 1]
-            clip_boxes(boxes, (ori_img_shape[0][i], ori_img_shape[1][i]))
-
-        all_box.append(boxes)
-        data_offset += max_det * 6
-
-    return all_box
-
-def box_cxcywh_to_xyxy(x):
-    x_c, y_c, w, h = x.unbind(-1)
-    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
-         (x_c + 0.5 * w), (y_c + 0.5 * h)]
-    return torch.stack(b, dim=-1)
-
-
-def convert_to_xywh(boxes):
-    xmin, ymin, xmax, ymax = boxes.unbind(-1)
-    return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
-
-def DetrPostprocess(pred_logits, pred_boxes, target_sizes):
-    
-    out_logits = torch.from_numpy(pred_logits) 
-    out_bbox = torch.from_numpy(pred_boxes)
-    assert len(target_sizes) == 2
-    
-    prob = F.softmax(out_logits, -1)
-    scores, labels = prob[..., :-1].max(-1)
-    
-    # convert to [x0, y0, x1, y1] format 
-    boxes = box_cxcywh_to_xyxy(out_bbox)
-    # and from relative [0, 1] to absolute [0, height] coordinates
-    img_w, img_h = target_sizes
-    scale_fct = torch.tensor([img_w, img_h, img_w, img_h])
-    boxes = boxes * scale_fct
-   
-    boxes = clip_boxes(boxes, target_sizes)
-    boxes = convert_to_xywh(boxes)
-
-    labels = labels.unsqueeze(1)
-    scores =scores.unsqueeze(1)
-    pred_boxes = torch.cat([
-            boxes, 
-            labels, 
-            scores], dim=1).numpy().tolist()
-    return pred_boxes
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py b/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
deleted file mode 100644
index e5b4ddfb..00000000
--- a/models/cv/object_detection/fcos/ixrt/datasets/pre_process.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import cv2
-import math
-import numpy as np
-
-from .common import letterbox
-
-def get_post_process(data_process_type):
-    if data_process_type == "yolov5":
-        return Yolov5Preprocess
-    elif data_process_type == "yolov3":
-        return Yolov3Preprocess
-    elif data_process_type == "yolox":
-        return YoloxPreprocess
-    elif data_process_type == "detr":
-        return DetrPreprocess
-    return None
-
-def Yolov3Preprocess(image, img_size):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    image = cv2.resize(image, (img_size, img_size))
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def Yolov5Preprocess(image, img_size, augment=False):
-
-    h0, w0 = image.shape[:2]  # orig hw
-    r = img_size / max(h0, w0)  # ratio
-
-    if r != 1:  # if sizes are not equal
-        interp = cv2.INTER_LINEAR if (augment or r > 1) else cv2.INTER_AREA
-        image = cv2.resize(image, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
-
-    # shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size  rect == True
-
-    image, ratio, dwdh = letterbox(image, new_shape=img_size, auto=False, scaleup=False)
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    return image
-
-def YoloxPreprocess(img, img_size, swap=(2,0,1)):
-
-    padded_img = np.ones((img_size, img_size, 3), dtype=np.uint8) * 114
-    r = min(img_size / img.shape[0], img_size / img.shape[1])
-    resized_img = cv2.resize(
-        img,
-        (int(img.shape[1] * r), int(img.shape[0] * r)),
-        interpolation=cv2.INTER_LINEAR, 
-    ).astype(np.uint8)
-
-    padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
-    padded_img = padded_img.transpose(swap)
-    padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
-
-    return padded_img
-
-def DetrPreprocess(image, img_size):    
-    # img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-    # img = img.resize((img_size, img_size))
-    
-    std = [0.485, 0.456, 0.406] 
-    mean = [0.229, 0.224, 0.225]
-    
-    image = cv2.resize(image, (img_size, img_size))
-    image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
-    image = np.ascontiguousarray(image).astype(np.float32) / 255.0  # 0~1 np array
-    
-    image[0,:,:] = (image[0,:,:]- std[0])/mean[0]
-    image[1,:,:] = (image[1,:,:]- std[1])/mean[1]
-    image[2,:,:] = (image[2,:,:]- std[2])/mean[2]
-    
-    return image
-    
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/datasets/vision.py b/models/cv/object_detection/fcos/ixrt/datasets/vision.py
deleted file mode 100644
index 32da4a78..00000000
--- a/models/cv/object_detection/fcos/ixrt/datasets/vision.py
+++ /dev/null
@@ -1,136 +0,0 @@
-import os
-from typing import Any, Callable, List, Optional, Tuple
-
-import torch
-import torch.utils.data as data
-
-from types import FunctionType
-
-def _log_api_usage_once(obj: Any) -> None:
-
-    """
-    Logs API usage(module and name) within an organization.
-    In a large ecosystem, it's often useful to track the PyTorch and
-    TorchVision APIs usage. This API provides the similar functionality to the
-    logging module in the Python stdlib. It can be used for debugging purpose
-    to log which methods are used and by default it is inactive, unless the user
-    manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
-    Please note it is triggered only once for the same API call within a process.
-    It does not collect any data from open-source users since it is no-op by default.
-    For more information, please refer to
-    * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
-    * Logging policy: https://github.com/pytorch/vision/issues/5052;
-
-    Args:
-        obj (class instance or method): an object to extract info from.
-    """
-    module = obj.__module__
-    if not module.startswith("torchvision"):
-        module = f"torchvision.internal.{module}"
-    name = obj.__class__.__name__
-    if isinstance(obj, FunctionType):
-        name = obj.__name__
-    torch._C._log_api_usage_once(f"{module}.{name}")
-
-class VisionDataset(data.Dataset):
-    """
-    Base Class For making datasets which are compatible with torchvision.
-    It is necessary to override the ``__getitem__`` and ``__len__`` method.
-
-    Args:
-        root (string): Root directory of dataset.
-        transforms (callable, optional): A function/transforms that takes in
-            an image and a label and returns the transformed versions of both.
-        transform (callable, optional): A function/transform that  takes in an PIL image
-            and returns a transformed version. E.g, ``transforms.RandomCrop``
-        target_transform (callable, optional): A function/transform that takes in the
-            target and transforms it.
-
-    .. note::
-
-        :attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
-    """
-
-    _repr_indent = 4
-
-    def __init__(
-        self,
-        root: str,
-        transforms: Optional[Callable] = None,
-        transform: Optional[Callable] = None,
-        target_transform: Optional[Callable] = None,
-    ) -> None:
-        _log_api_usage_once(self)
-        if isinstance(root, str):
-            root = os.path.expanduser(root)
-        self.root = root
-
-        has_transforms = transforms is not None
-        has_separate_transform = transform is not None or target_transform is not None
-        if has_transforms and has_separate_transform:
-            raise ValueError("Only transforms or transform/target_transform can be passed as argument")
-
-        # for backwards-compatibility
-        self.transform = transform
-        self.target_transform = target_transform
-
-        if has_separate_transform:
-            transforms = StandardTransform(transform, target_transform)
-        self.transforms = transforms
-
-    def __getitem__(self, index: int) -> Any:
-        """
-        Args:
-            index (int): Index
-
-        Returns:
-            (Any): Sample and meta data, optionally transformed by the respective transforms.
-        """
-        raise NotImplementedError
-
-    def __len__(self) -> int:
-        raise NotImplementedError
-
-    def __repr__(self) -> str:
-        head = "Dataset " + self.__class__.__name__
-        body = [f"Number of datapoints: {self.__len__()}"]
-        if self.root is not None:
-            body.append(f"Root location: {self.root}")
-        body += self.extra_repr().splitlines()
-        if hasattr(self, "transforms") and self.transforms is not None:
-            body += [repr(self.transforms)]
-        lines = [head] + [" " * self._repr_indent + line for line in body]
-        return "\n".join(lines)
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def extra_repr(self) -> str:
-        return ""
-
-
-class StandardTransform:
-    def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
-        self.transform = transform
-        self.target_transform = target_transform
-
-    def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
-        if self.transform is not None:
-            input = self.transform(input)
-        if self.target_transform is not None:
-            target = self.target_transform(target)
-        return input, target
-
-    def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
-        lines = transform.__repr__().splitlines()
-        return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
-
-    def __repr__(self) -> str:
-        body = [self.__class__.__name__]
-        if self.transform is not None:
-            body += self._format_transform_repr(self.transform, "Transform: ")
-        if self.target_transform is not None:
-            body += self._format_transform_repr(self.target_transform, "Target transform: ")
-
-        return "\n".join(body)
diff --git a/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py b/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
deleted file mode 100644
index 932efbdf..00000000
--- a/models/cv/object_detection/fcos/ixrt/load_ixrt_plugin.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import ctypes
-import tensorrt
-from os.path import join, dirname, exists
-def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
-    if not dynamic_path:
-        dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
-    if not exists(dynamic_path):
-        raise FileNotFoundError(
-            f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!")
-    ctypes.CDLL(dynamic_path)
-    tensorrt.init_libnvinfer_plugins(logger, namespace)
-    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/modify_batchsize.py b/models/cv/object_detection/fcos/ixrt/modify_batchsize.py
deleted file mode 100644
index 00ed65dd..00000000
--- a/models/cv/object_detection/fcos/ixrt/modify_batchsize.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import onnx
-import argparse
-
-def change_input_dim(model, bsz):
-    batch_size = bsz
-
-    # The following code changes the first dimension of every input to be batch_size
-    # Modify as appropriate ... note that this requires all inputs to
-    # have the same batch_size
-    inputs = model.graph.input
-    for input in inputs:
-        # Checks omitted.This assumes that all inputs are tensors and have a shape with first dim.
-        # Add checks as needed.
-        dim1 = input.type.tensor_type.shape.dim[0]
-        # update dim to be a symbolic value
-        if isinstance(batch_size, str):
-            # set dynamic batch size
-            dim1.dim_param = batch_size
-        elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
-            # set given batch size
-            dim1.dim_value = int(batch_size)
-        else:
-            # set batch size of 1
-            dim1.dim_value = 1
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--batch_size", type=int)
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-model = onnx.load(args.origin_model)
-change_input_dim(model, args.batch_size)
-onnx.save(model, args.output_model)
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/simplify_model.py b/models/cv/object_detection/fcos/ixrt/simplify_model.py
deleted file mode 100644
index b4254b6f..00000000
--- a/models/cv/object_detection/fcos/ixrt/simplify_model.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import onnx
-import argparse
-from onnxsim import simplify
-
-# Simplify
-def simplify_model(args):
-    onnx_model = onnx.load(args.origin_model)
-    model_simp, check = simplify(onnx_model)
-    model_simp = onnx.shape_inference.infer_shapes(model_simp)
-    onnx.save(model_simp, args.output_model)
-    print("  Simplify onnx Done.")
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--origin_model", type=str)
-    parser.add_argument("--output_model", type=str)
-    args = parser.parse_args()
-    return args
-
-args = parse_args()
-simplify_model(args)
\ No newline at end of file
diff --git a/models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py b/models/cv/object_detection/ixrt_common/fcos_r50_caffe_fpn_gn-head_1x_coco.py
similarity index 100%
rename from models/cv/object_detection/fcos/ixrt/fcos_r50_caffe_fpn_gn-head_1x_coco.py
rename to models/cv/object_detection/ixrt_common/fcos_r50_caffe_fpn_gn-head_1x_coco.py
diff --git a/models/cv/object_detection/fcos/ixrt/inference_mmdet.py b/models/cv/object_detection/ixrt_common/inference_mmdet.py
similarity index 100%
rename from models/cv/object_detection/fcos/ixrt/inference_mmdet.py
rename to models/cv/object_detection/ixrt_common/inference_mmdet.py
diff --git a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
index 481003d1..7d6d6fba 100644
--- a/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
+++ b/models/cv/object_detection/yolov3/ixrt/ci/prepare.sh
@@ -27,8 +27,8 @@ fi
 
 pip3 install -r ../../ixrt_common/requirements.txt
 mkdir checkpoints
-unzip -q /mnt/deepspark/data/3rd_party/onnx_tflite_yolov3.zip -d ./
-cp /mnt/deepspark/data/checkpoints/yolov3.weights onnx_tflite_yolov3/weights
+unzip -q /root/data/3rd_party/onnx_tflite_yolov3.zip -d ./
+cp /root/data/checkpoints/yolov3.weights onnx_tflite_yolov3/weights
 cd onnx_tflite_yolov3
 python3 detect.py --cfg cfg/yolov3.cfg --weights weights/yolov3.weights
 mv weights/export.onnx ../checkpoints/yolov3.onnx
-- 
Gitee


From ad1a096d45e2ef56cfe4ad221cf7c8b2141fccad Mon Sep 17 00:00:00 2001
From: "hongliang.yuan" <hongliang.yuan@iluvatar.com>
Date: Thu, 3 Jul 2025 17:25:39 +0800
Subject: [PATCH 15/15] update

---
 models/cv/object_detection/ixrt_common/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/models/cv/object_detection/ixrt_common/requirements.txt b/models/cv/object_detection/ixrt_common/requirements.txt
index b0f4374b..46ef4ec8 100644
--- a/models/cv/object_detection/ixrt_common/requirements.txt
+++ b/models/cv/object_detection/ixrt_common/requirements.txt
@@ -4,4 +4,5 @@ onnxsim
 ultralytics
 pycocotools
 opencv-python==4.6.0.66
-pycuda
\ No newline at end of file
+pycuda
+seaborn
\ No newline at end of file
-- 
Gitee