diff --git a/README.md b/README.md
index d33db3066de534e8fb3aead86ab9ab5ea63b252c..5ad55a16581bc03d39b13b847277f0d29dda59a5 100644
--- a/README.md
+++ b/README.md
@@ -396,15 +396,15 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
- |
- Wide_ResNet50 |
+ Wide ResNet50 |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
Supported |
- - |
+ Supported |
@@ -746,7 +746,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
Conformer |
FP16 |
Supported |
- - |
+ Supported |
INT8 |
diff --git a/models/cv/classification/swin_transformer_large/ixrt/README.md b/models/cv/classification/swin_transformer_large/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7f2282f9fcfdfc756cebc98df53c11c0d03c339c
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/README.md
@@ -0,0 +1,96 @@
+# Swin Transformer Large
+
+## Description
+
+Swin Transformer-Large is a variant of the Swin Transformer, an architecture designed for computer vision tasks, particularly within the realms of image classification, object detection, and segmentation. The Swin Transformer-Large model represents an expanded version with more layers and parameters compared to its base configuration, aiming for improved performance and deeper processing of visual data.
+
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/cv/classification/swin_transformer_large/ixrt
+cd ${MODEL_PATH}
+
+apt install -y libnuma-dev libgl1-mesa-glx
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the open_imagenet dataset.
+
+Alternatively, you can run the following script to fetch both the pretrained model and the dataset:
+
+```bash
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+Please adjust the paths in the following commands and files to match your environment.
+
+```bash
+tar -xvf open-swin-large.tar
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/swin-large-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/swin-large/swin-transformer-large.pt --output_path swin-large-torch-fp32.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./swin-large-torch-fp32
+export OPTIMIZER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash ./scripts/infer_swinl_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document: . Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# copy data
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cp -r datasets/open_imagenet/* ByteMLPerf/byte_infer_perf/general_perf/datasets/open_imagenet/
+mkdir -p ./ByteMLPerf/general_perf/model_zoo/popular/swin-large
+cp general_perf/model_zoo/popular/swin-large/* ./ByteMLPerf/general_perf/model_zoo/popular/swin-large
+
+# run acc scripts
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task swin-large-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Top-1 Acc |
+| ---------------------- | --------- | --------- | ----- | --------- |
+| Swin Transformer Large | 2 | FP16 | 5.746 | 85.62 |
diff --git a/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py b/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+        option = yes_no_dialog(title=self.backend_type + ' Compile Configuration',
+                               text='[Please select]: Configure the compile backend?',
+                               style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+                    title=self.backend_type + ' Compile Configuration Progress (' + str(i + 1) +
+                    '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh b/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eb9350ff48d8a05d6d7f5e2bf3ce1eb0930033b2
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/scripts/infer_swinl_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=16
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+    python3 ${OPTIMIZER_FILE} --onnx ${ORIGIN_ONNX} --model_type swint --dump_onnx
+    OPTIMIZE_STATUS=$?
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin\
+ --min_shape pixel_values.1:${BS}x3x384x384 --opt_shape pixel_values.1:${BS}x3x384x384 --max_shape pixel_values.1:${BS}x3x384x384
+
+ # Test Performance
+    ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes pixel_values.1:${BS}x3x384x384
+ PERFORMANCE_STATUS=$?
+
+}
+run 1
\ No newline at end of file
diff --git a/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh b/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8154d350c9e24de33313a12873582641f8c73263
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_imagenet/
+
+wget -O general_perf/download/open-swin-large.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open-swin-large.tar
+tar xf general_perf/download/open-swin-large.tar -C general_perf/model_zoo/popular/
+
+
+# Download Datasets
+wget -O general_perf/download/open_imagenet.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_imagenet.tar
+tar xf general_perf/download/open_imagenet.tar -C datasets/
+
+
+echo "Extract Done."
diff --git a/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py b/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f115b730caf065b3f3dfc496c161916afc96d9e
--- /dev/null
+++ b/models/cv/classification/swin_transformer_large/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split("/")[-1][:-4]
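+    # output_path is expected to end with ".onnx"; slicing off the last 4 characters keeps the
+    # trailing dot, so model_name + "json" resolves to "<name>.json" in the current directory.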
+ with open(model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/igie/README.md b/models/cv/classification/wide_resnet50/igie/README.md
index 50120e88b90dca143ed5b9ce856af7d3903f5aa8..c3bebf1738ae76036f696eb7f1f0e6a80f93553b 100644
--- a/models/cv/classification/wide_resnet50/igie/README.md
+++ b/models/cv/classification/wide_resnet50/igie/README.md
@@ -1,4 +1,4 @@
-# WideResNet50
+# Wide ResNet50
## Description
@@ -51,7 +51,7 @@ bash scripts/infer_wide_resnet50_int8_performance.sh
## Results
-Model |BatchSize |Precision |FPS |Top-1(%) |Top-5(%)
--------------|-----------|----------|----------|----------|--------
-WideResNet50 | 32 | FP16 | 2312.383 | 78.459 | 94.052
-WideResNet50 | 32 | INT8 | 5195.654 | 77.957 | 93.798
+| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) |
+| ------------- | --------- | --------- | -------- | -------- | -------- |
+| Wide ResNet50 | 32 | FP16 | 2312.383 | 78.459 | 94.052 |
+| Wide ResNet50 | 32 | INT8 | 5195.654 | 77.957 | 93.798 |
diff --git a/models/cv/classification/wide_resnet50/ixrt/README.md b/models/cv/classification/wide_resnet50/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..72dd1308b11b2dd7f6237e8c7ec782c99107e0c2
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/README.md
@@ -0,0 +1,61 @@
+# Wide ResNet50
+
+## Description
+
+The distinguishing feature of Wide ResNet50 lies in its widened architecture compared to traditional ResNet models. By increasing the width of the residual blocks, Wide ResNet50 enhances the capacity of the network to capture richer and more diverse feature representations, leading to improved performance on various visual recognition tasks.
+
+## Setup
+
+### Install
+
+```bash
+pip3 install onnx
+pip3 install tqdm
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the ImageNet validation dataset.
+
+### Model Conversion
+
+```bash
+mkdir -p checkpoints/
+python3 export.py --weight wide_resnet50_2-95faca4d.pth --output checkpoints/wide_resnet50.onnx
+```
+
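+The scripts below drive the engine build and inference end to end. If you prefer to build an engine by hand from the exported ONNX, here is a minimal sketch using `build_engine.py` from this directory (the output file name is illustrative):
+
+```bash
+# Build an FP16 engine directly from the exported ONNX (paths are examples)
+python3 build_engine.py \
+    --model checkpoints/wide_resnet50.onnx \
+    --precision float16 \
+    --engine checkpoints/wide_resnet50_fp16.engine
+```
+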
+## Inference
+
+```bash
+export DATASETS_DIR=/Path/to/imagenet_val/
+export CHECKPOINTS_DIR=./checkpoints
+export RUN_DIR=./
+export CONFIG_DIR=config/WIDE_RESNET50_CONFIG
+```
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_wide_resnet50_fp16_accuracy.sh
+# Performance
+bash scripts/infer_wide_resnet50_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_wide_resnet50_int8_accuracy.sh
+# Performance
+bash scripts/infer_wide_resnet50_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | Top-1(%) | Top-5(%) |
+| ------------- | --------- | --------- | -------- | -------- | -------- |
+| Wide ResNet50 | 32 | FP16 | 2478.551 | 78.486 | 94.084 |
+| Wide ResNet50 | 32 | INT8 | 5981.702 | 76.956 | 93.920 |
diff --git a/models/cv/classification/wide_resnet50/ixrt/build_engine.py b/models/cv/classification/wide_resnet50/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e126bc715aa77d38c3abdd1e02191a262689e7
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/build_engine.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from calibration_dataset import getdataloader
+import cuda.cudart as cudart
+
+def assertSuccess(err):
+ assert(err == cudart.cudaError_t.cudaSuccess)
+
+class EngineCalibrator(tensorrt.IInt8EntropyCalibrator2):
+
+ def __init__(self, cache_file, datasets_dir, loop_count=10, bsz=1, img_sz=224):
+ super().__init__()
+ self.cache_file = cache_file
+ self.image_batcher = getdataloader(datasets_dir, loop_count, batch_size=bsz, img_sz=img_sz)
+ self.batch_generator = iter(self.image_batcher)
+        # Allocate a device buffer large enough for one float32 calibration batch (NCHW).
+        size = img_sz * img_sz * 3 * bsz * np.dtype(np.float32).itemsize
+        err, self.batch_allocation = cudart.cudaMalloc(size)
+ assertSuccess(err)
+
+ def __del__(self):
+ err,= cudart.cudaFree(self.batch_allocation)
+ assertSuccess(err)
+
+ def get_batch_size(self):
+ return self.image_batcher.batch_size
+
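+    # TensorRT calls get_batch repeatedly during INT8 calibration; it must return a list of
+    # device pointers (one per input), and returning None signals that the calibration data is exhausted.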
+ def get_batch(self, names):
+ try:
+ batch, _ = next(self.batch_generator)
+ batch = batch.numpy()
+ cudart.cudaMemcpy(self.batch_allocation,
+ np.ascontiguousarray(batch),
+ batch.nbytes,
+ cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)
+ return [int(self.batch_allocation)]
+ except StopIteration:
+ return None
+
+ def read_calibration_cache(self):
+ if os.path.exists(self.cache_file):
+ with open(self.cache_file, "rb") as f:
+ return f.read()
+
+ def write_calibration_cache(self, cache):
+ with open(self.cache_file, "wb") as f:
+ f.write(cache)
+
+def main(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.VERBOSE)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ print("precision : ", precision)
+ build_config.set_flag(precision)
+ if config.precision == "int8":
+ build_config.int8_calibrator = EngineCalibrator("int8_cache", config.datasets_dir)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--engine", type=str, default=None)
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+ help="ImageNet dir",
+ )
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
diff --git a/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py b/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6038b33f50cff7a14efcefa6673ae9d2fd19870b
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/build_i8_engine.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import json
+import os
+
+import tensorrt
+import tensorrt as trt
+
+TRT_LOGGER = trt.Logger(tensorrt.Logger.VERBOSE)
+
+EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+
+
+def GiB(val):
+ return val * 1 << 30
+
+
+def json_load(filename):
+ with open(filename) as json_file:
+ data = json.load(json_file)
+ return data
+
+
+def setDynamicRange(network, json_file):
+ """Sets ranges for network layers."""
+ quant_param_json = json_load(json_file)
+ act_quant = quant_param_json["act_quant_info"]
+
+ for i in range(network.num_inputs):
+ input_tensor = network.get_input(i)
+ if act_quant.__contains__(input_tensor.name):
+ print(input_tensor.name)
+ value = act_quant[input_tensor.name]
+ tensor_max = abs(value)
+ tensor_min = -abs(value)
+ input_tensor.dynamic_range = (tensor_min, tensor_max)
+
+ for i in range(network.num_layers):
+ layer = network.get_layer(i)
+
+ for output_index in range(layer.num_outputs):
+ tensor = layer.get_output(output_index)
+
+ if act_quant.__contains__(tensor.name):
+ value = act_quant[tensor.name]
+ tensor_max = abs(value)
+ tensor_min = -abs(value)
+ tensor.dynamic_range = (tensor_min, tensor_max)
+ else:
+ print("\033[1;32m%s\033[0m" % tensor.name)
+
+
+def build_engine(onnx_file, json_file, engine_file):
+ builder = trt.Builder(TRT_LOGGER)
+ network = builder.create_network(EXPLICIT_BATCH)
+
+ config = builder.create_builder_config()
+
+ # If it is a dynamic onnx model , you need to add the following.
+ # profile = builder.create_optimization_profile()
+ # profile.set_shape("input_name", (batch, channels, min_h, min_w), (batch, channels, opt_h, opt_w), (batch, channels, max_h, max_w))
+ # config.add_optimization_profile(profile)
+
+ parser = trt.OnnxParser(network, TRT_LOGGER)
+ # config.max_workspace_size = GiB(1)
+ if not os.path.exists(onnx_file):
+ quit("ONNX file {} not found".format(onnx_file))
+
+ with open(onnx_file, "rb") as model:
+ if not parser.parse(model.read()):
+ print("ERROR: Failed to parse the ONNX file.")
+ for error in range(parser.num_errors):
+ print(parser.get_error(error))
+ return None
+
+ config.set_flag(trt.BuilderFlag.INT8)
+
+ setDynamicRange(network, json_file)
+
+ engine = builder.build_engine(network, config)
+
+ with open(engine_file, "wb") as f:
+ f.write(engine.serialize())
+
+
+if __name__ == "__main__":
+ # Add plugins if needed
+ # import ctypes
+ # ctypes.CDLL("libmmdeploy_tensorrt_ops.so")
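+    # Example invocation (file names are illustrative; quant.py writes quantized_<model_name>.onnx and quant_cfg.json):
+    #   python3 build_i8_engine.py --onnx quantized_wide_resnet50.onnx --qparam_json quant_cfg.json --engine wide_resnet50_int8.engine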
+ parser = argparse.ArgumentParser(
+ description="Writing qparams to onnx to convert tensorrt engine."
+ )
+ parser.add_argument("--onnx", type=str, default=None)
+ parser.add_argument("--qparam_json", type=str, default=None)
+ parser.add_argument("--engine", type=str, default=None)
+ arg = parser.parse_args()
+
+ build_engine(arg.onnx, arg.qparam_json, arg.engine)
+ print("\033[1;32mgenerate %s\033[0m" % arg.engine)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py b/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec931c656abf5b2309dc9938490df46e4e8cdb19
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/calibration_dataset.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from torchvision import models
+from torchvision import transforms as T
+
+
+class CalibrationImageNet(torchvision.datasets.ImageFolder):
+ def __init__(self, *args, **kwargs):
+ super(CalibrationImageNet, self).__init__(*args, **kwargs)
+ img2label_path = os.path.join(self.root, "val_map.txt")
+ if not os.path.exists(img2label_path):
+ raise FileNotFoundError(f"Not found label file `{img2label_path}`.")
+
+ self.img2label_map = self.make_img2label_map(img2label_path)
+
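+    # val_map.txt is expected to hold one "<image file name>\t<integer label>" entry per line.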
+ def make_img2label_map(self, path):
+ with open(path) as f:
+ lines = f.readlines()
+
+ img2lable_map = dict()
+ for line in lines:
+ line = line.lstrip().rstrip().split("\t")
+ if len(line) != 2:
+ continue
+ img_name, label = line
+ img_name = img_name.strip()
+ if img_name in [None, ""]:
+ continue
+ label = int(label.strip())
+ img2lable_map[img_name] = label
+ return img2lable_map
+
+ def __getitem__(self, index):
+ path, target = self.samples[index]
+ sample = self.loader(path)
+ if self.transform is not None:
+ sample = self.transform(sample)
+ # if self.target_transform is not None:
+ # target = self.target_transform(target)
+ img_name = os.path.basename(path)
+ target = self.img2label_map[img_name]
+
+ return sample, target
+
+
+def create_dataloaders(data_path, num_samples=1024, img_sz=224, batch_size=2, workers=0):
+ dataset = CalibrationImageNet(
+ data_path,
+ transform=T.Compose(
+ [
+ T.Resize(256),
+ T.CenterCrop(img_sz),
+ T.ToTensor(),
+ T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+ ]
+ ),
+ )
+
+ calibration_dataset = dataset
+ if num_samples is not None:
+ calibration_dataset = torch.utils.data.Subset(
+ dataset, indices=range(num_samples)
+ )
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=False,
+ num_workers=workers,
+ )
+
+ verify_dataloader = DataLoader(
+ dataset,
+ shuffle=False,
+ batch_size=batch_size,
+ drop_last=False,
+ num_workers=workers,
+ )
+
+ return calibration_dataloader, verify_dataloader
+
+
+def getdataloader(dataset_dir, step=20, batch_size=32, workers=2, img_sz=224, total_sample=50000):
+ num_samples = min(total_sample, step * batch_size)
+ if step < 0:
+ num_samples = None
+ calibration_dataloader, _ = create_dataloaders(
+ dataset_dir,
+ img_sz=img_sz,
+ batch_size=batch_size,
+ workers=workers,
+ num_samples=num_samples,
+ )
+ return calibration_dataloader
diff --git a/models/cv/classification/wide_resnet50/ixrt/common.py b/models/cv/classification/wide_resnet50/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..0458195e5b7980ce70585d7284ca8a875afa3fd6
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/common.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import torch
+import tensorrt
+import numpy as np
+import pycuda.driver as cuda
+
+def eval_batch(batch_score, batch_label):
+ batch_score = torch.tensor(torch.from_numpy(batch_score), dtype=torch.float32)
+ values, indices = batch_score.topk(5)
+ top1, top5 = 0, 0
+ for idx, label in enumerate(batch_label):
+
+ if label == indices[idx][0]:
+ top1 += 1
+ if label in indices[idx]:
+ top5 += 1
+ return top1, top5
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
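+    # Walk every engine binding, record its metadata, and allocate a device buffer large enough for one batch of it.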
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
diff --git a/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG b/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG
new file mode 100644
index 0000000000000000000000000000000000000000..04e6b34078b14979940a6f5b0747b8032ab6fc2a
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/config/WIDE_RESNET50_CONFIG
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+# IMGSIZE : height/width of the model input
+# MODEL_NAME : basename used for the generated onnx/engine files
+# ORIGINE_MODEL : file name of the original onnx model
+IMGSIZE=224
+MODEL_NAME=Wide_Resnet50
+ORIGINE_MODEL=wide_resnet50.onnx
+
+# QUANT CONFIG (takes effect only when PRECISION is int8)
+ # QUANT_OBSERVER : quantization observer, one of [hist_percentile, percentile, minmax, entropy, ema]
+ # QUANT_BATCHSIZE : batch size of the calibration dataloader; it is best to keep it the same as the batch size in the onnx model, otherwise some ops (e.g. Reshape) may infer shapes incorrectly
+ # QUANT_STEP : number of calibration steps
+ # QUANT_SEED : random seed, to make the quantization result reproducible
+ # QUANT_EXIST_ONNX : fill in if you already have a quantized model from another source
+QUANT_OBSERVER=minmax
+QUANT_BATCHSIZE=1
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=
+QUANT_EXIST_ONNX=
diff --git a/models/cv/classification/wide_resnet50/ixrt/export.py b/models/cv/classification/wide_resnet50/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d3c64c825ab3aaf172f0c6ca7ef9b802ea06bf9
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/export.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import torch
+import torchvision
+import argparse
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="pytorch model weight.")
+
+ parser.add_argument("--output",
+ type=str,
+ required=True,
+ help="export onnx model path.")
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = parse_args()
+
+ model = torchvision.models.wide_resnet50_2()
+ model.load_state_dict(torch.load(args.weight))
+ model.eval()
+
+ input_names = ['input']
+ output_names = ['output']
+ dynamic_axes = {'input': {0: '-1'}, 'output': {0: '-1'}}
+ dummy_input = torch.randn(1, 3, 224, 224)
+
+ torch.onnx.export(
+ model,
+ dummy_input,
+ args.output,
+ input_names = input_names,
+ dynamic_axes = dynamic_axes,
+ output_names = output_names,
+ opset_version=13
+ )
+
+ print("Export onnx model successfully! ")
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/classification/wide_resnet50/ixrt/inference.py b/models/cv/classification/wide_resnet50/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dcb3f9cc5b9a26903651a31fafa16d8f0db31
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/inference.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import json
+import os
+import re
+import time
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import tensorrt
+
+from calibration_dataset import getdataloader
+from common import eval_batch, create_engine_context, get_io_bindings
+
+def main(config):
+ dataloader = getdataloader(config.datasets_dir, config.loop_count, config.bsz, img_sz=config.imgsz)
+
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+ # Load Engine && I/O bindings
+ engine, context = create_engine_context(config.engine_file, logger)
+ inputs, outputs, allocations = get_io_bindings(engine)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ # Inference
+ if config.test_mode == "FPS":
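+        # Throughput test: execute the engine loop_count times on the pre-bound buffers and derive FPS from wall-clock time.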
+ torch.cuda.synchronize()
+ start_time = time.time()
+
+ for i in range(config.loop_count):
+ context.execute_v2(allocations)
+
+ torch.cuda.synchronize()
+ end_time = time.time()
+ forward_time = end_time - start_time
+
+ num_samples = 50000
+ if config.loop_count * config.bsz < num_samples:
+ num_samples = config.loop_count * config.bsz
+ fps = num_samples / forward_time
+
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ elif config.test_mode == "ACC":
+
+ ## Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ print(f"output shape : {output.shape} output type : {output.dtype}")
+
+ total_sample = 0
+ acc_top1, acc_top5 = 0, 0
+
+ with tqdm(total= len(dataloader)) as _tqdm:
+ for idx, (batch_data, batch_label) in enumerate(dataloader):
+ batch_data = batch_data.numpy().astype(inputs[0]["dtype"])
+ batch_data = np.ascontiguousarray(batch_data)
+ total_sample += batch_data.shape[0]
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ # squeeze output shape [32,1000,1,1] to [32,1000] for mobilenet_v2 model
+ if len(output.shape) == 4:
+ output = output.squeeze(axis=(2,3))
+
+ batch_top1, batch_top5 = eval_batch(output, batch_label)
+ acc_top1 += batch_top1
+ acc_top5 += batch_top5
+
+ _tqdm.set_postfix(acc_1='{:.4f}'.format(acc_top1/total_sample),
+ acc_5='{:.4f}'.format(acc_top5/total_sample))
+ _tqdm.update(1)
+
+ print(F"Acc@1 : {acc_top1/total_sample} = {acc_top1}/{total_sample}")
+ print(F"Acc@5 : {acc_top5/total_sample} = {acc_top5}/{total_sample}")
+ acc1 = acc_top1/total_sample
+ print(f"Accuracy Check : Test {acc1} >= target {config.acc_target}")
+ if acc1 >= config.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
+ parser.add_argument(
+ "--engine_file",
+ type=str,
+ help="engine file path"
+ )
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+ help="ImageNet dir",
+ )
+ parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times")
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=224,
+ help="inference size h,w",
+ )
+ parser.add_argument("--use_async", action="store_true")
+ parser.add_argument(
+ "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4"
+ )
+ parser.add_argument("--fps_target", type=float, default=-1.0)
+ parser.add_argument("--acc_target", type=float, default=-1.0)
+ parser.add_argument("--loop_count", type=int, default=-1)
+
+ config = parser.parse_args()
+ return config
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
diff --git a/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py b/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py
new file mode 100644
index 0000000000000000000000000000000000000000..689b7a972dcbfec77c185592ede16bb4f04fa4fd
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/modify_batchsize.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+
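+# Example usage (file names are illustrative):
+#   python3 modify_batchsize.py --batch_size 32 --origin_model wide_resnet50.onnx --output_model wide_resnet50_bs32.onnx
+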
+def change_input_dim(model, bsz):
+ batch_size = bsz
+
+ # The following code changes the first dimension of every input to be batch_size
+ # Modify as appropriate ... note that this requires all inputs to
+ # have the same batch_size
+ inputs = model.graph.input
+ for input in inputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with a first (batch) dim.
+ # Add checks as needed.
+ dim1 = input.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ if isinstance(batch_size, str):
+ # set dynamic batch size
+ dim1.dim_param = batch_size
+ elif (isinstance(batch_size, str) and batch_size.isdigit()) or isinstance(batch_size, int):
+ # set given batch size
+ dim1.dim_value = int(batch_size)
+ else:
+ # set batch size of 1
+ dim1.dim_value = 1
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--batch_size", type=int)
+ parser.add_argument("--origin_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+model = onnx.load(args.origin_model)
+change_input_dim(model, args.batch_size)
+onnx.save(model, args.output_model)
+
+
+
+
+
diff --git a/models/cv/classification/wide_resnet50/ixrt/quant.py b/models/cv/classification/wide_resnet50/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d71c828629bb0370aa40c5bcdcf117812bbaedc
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/quant.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""这是一个高度自动化的 PPQ 量化的入口脚本,将你的模型和数据按要求进行打包:
+
+在自动化 API 中,我们使用 QuantizationSetting 对象传递量化参数。
+
+This file will show you how to quantize your network with PPQ
+ You should prepare your model and calibration dataset as follow:
+
+ ~/working/model.onnx <-- your model
+ ~/working/data/*.npy or ~/working/data/*.bin <-- your dataset
+
+if you are using caffe model:
+    ~/working/model.caffemodel <-- your model
+    ~/working/model.prototxt <-- your model
+
+### MAKE SURE YOUR INPUT LAYOUT IS [N, C, H, W] or [C, H, W] ###
+
+quantized model will be generated at: ~/working/quantized.onnx
+"""
+from ppq import *
+from ppq.api import *
+import os
+from calibration_dataset import getdataloader
+import argparse
+import random
+import numpy as np
+import torch
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--dataset_dir", type=str, default="imagenet_val")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"],
+ default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_dir", type=str, help="save path", default=None)
+ parser.add_argument("--bsz", type=int, default=32)
+ parser.add_argument("--step", type=int, default=20)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=224)
+ args = parser.parse_args()
+ print("Quant config:", args)
+ print(args.disable_quant_names)
+ return args
+
+
+config = parse_args()
+
+# modify configuration below:
+WORKING_DIRECTORY = 'checkpoints' # choose your working directory
+TARGET_PLATFORM = TargetPlatform.TRT_INT8 # choose your target platform
+MODEL_TYPE = NetworkFramework.ONNX # or NetworkFramework.CAFFE
+INPUT_LAYOUT = 'chw' # input data layout, chw or hwc
+NETWORK_INPUTSHAPE = [1, 3, 224, 224] # input shape of your network
+EXECUTING_DEVICE = 'cuda' # 'cuda' or 'cpu'.
+REQUIRE_ANALYSE = False
+TRAINING_YOUR_NETWORK = False             # whether to finetune your network
+# -------------------------------------------------------------------
+# Load your model file; PPQ will parse the onnx or caffe model into its own format.
+# If you are using a framework such as pytorch or tensorflow, export the model to onnx first
+# with torch.onnx.export. If you hit an error while exporting the torch model, feel free to contact us.
+# -------------------------------------------------------------------
+graph = None
+if MODEL_TYPE == NetworkFramework.ONNX:
+ graph = load_onnx_graph(onnx_import_file=config.model)
+if MODEL_TYPE == NetworkFramework.CAFFE:
+ graph = load_caffe_graph(
+ caffemodel_path=os.path.join(WORKING_DIRECTORY, 'model.caffemodel'),
+ prototxt_path=os.path.join(WORKING_DIRECTORY, 'model.prototxt'))
+assert graph is not None, 'Graph Loading Error, Check your input again.'
+
+# -------------------------------------------------------------------
+# The SETTING object controls PPQ's quantization logic; it mainly describes graph fusion, scheduling, and fine-grained quantization strategies.
+# When the quantization error of your network is too high, modify the attributes of the SETTING object to apply specific optimizations.
+# -------------------------------------------------------------------
+QS = QuantizationSettingFactory.default_setting()
+
+# -------------------------------------------------------------------
+# The following shows how to use a finetuning pass to improve quantization accuracy.
+# PPQ provides more than ten algorithms to help you recover accuracy.
+# They are all enabled via QS.xxxx = True.
+# Use them as needed; do not turn them all on at once, or things can easily blow up.
+# -------------------------------------------------------------------
+if TRAINING_YOUR_NETWORK:
+    QS.lsq_optimization = True                             # retrain the network to reduce quantization error
+    QS.lsq_optimization_setting.steps = 500                # number of retraining steps; affects training time, 500 steps take a few minutes
+    QS.lsq_optimization_setting.collecting_device = 'cuda' # where cached data is kept; 'cuda' means on the GPU, switch to 'cpu' if you run out of GPU memory
+
+
+dataloader = getdataloader(config.dataset_dir, config.step, batch_size=config.bsz, img_sz=config.imgsz)
+# ENABLE CUDA KERNEL speeds up quantization by roughly 3x ~ 10x, but it cannot be compiled without the corresponding build environment.
+# You can try installing the build environment, or quantize without the CUDA kernel by simply removing "with ENABLE_CUDA_KERNEL():".
+with ENABLE_CUDA_KERNEL():
+    print('Quantizing the network; depending on your quantization configuration, this may take a while:')
+ quantized = quantize_native_model(
+        setting=QS,                                 # the setting object controls the standard quantization logic
+ model=graph,
+ calib_dataloader=dataloader,
+ calib_steps=config.step,
+        input_shape=NETWORK_INPUTSHAPE,             # if your network has a single input, pass its shape here
+        inputs=None,
+        # if your network has multiple inputs, use this parameter instead, i.e. input_shape=None, inputs=[torch.zeros(1,3,224,224), torch.zeros(1,3,224,224)]
+        collate_fn=lambda x: x[0].to(EXECUTING_DEVICE),  # collate_fn works like the collate fn of a torch dataloader and is used for data preprocessing;
+        # you can of course use the torch dataloader's own collate fn and set this to None
+ platform=TARGET_PLATFORM,
+ device=EXECUTING_DEVICE,
+ do_quantize=True)
+
+ # -------------------------------------------------------------------
+    # If you need to run the quantized network and obtain its outputs, create an executor.
+    # The executor behaves much like a torch.Module; you can use it to get execution results.
+    # Note that this must be done before exporting.
+ # -------------------------------------------------------------------
+ executor = TorchExecutor(graph=quantized, device=EXECUTING_DEVICE)
+ # output = executor.forward(input)
+
+ # -------------------------------------------------------------------
+    # When PPQ computes quantization error, it uses the reciprocal of the signal-to-noise ratio, i.e. noise energy / signal energy.
+    # A quantization error of 0.1 means the quantization noise accounts for roughly 10% of the overall signal energy.
+    # Note that graphwise_error_analyse measures the accumulated error;
+    # the last layer of a network usually has a large accumulated error, caused jointly by all the layers before it.
+    # Use layerwise_error_analyse to trace the error back to individual layers.
+ # -------------------------------------------------------------------
+    print('Computing the graph-wise quantization error (SNR); the error of the last layer should stay below 0.1 to preserve accuracy:')
+ reports = graphwise_error_analyse(
+ graph=quantized, running_device=EXECUTING_DEVICE, steps=32,
+ dataloader=dataloader, collate_fn=lambda x: x[0].to(EXECUTING_DEVICE))
+ for op, snr in reports.items():
+        if snr > 0.1: ppq_warning(f'Layer {op} has a significant accumulated quantization error; consider optimizing it.')
+
+ if REQUIRE_ANALYSE:
+        print('Computing the layer-wise quantization error (SNR); the independent error of each layer should stay below 0.1 to preserve accuracy:')
+ layerwise_error_analyse(graph=quantized, running_device=EXECUTING_DEVICE,
+ interested_outputs=None,
+ dataloader=dataloader, collate_fn=lambda x: x.to(EXECUTING_DEVICE))
+
+ # -------------------------------------------------------------------
+    # Use export_ppq_graph to export the quantized model.
+    # PPQ adjusts the model format according to the export platform you selected.
+ # -------------------------------------------------------------------
+    print('Quantization finished; generating the target files:')
+ export_ppq_graph(
+ graph=quantized, platform=TARGET_PLATFORM,
+ graph_save_to=os.path.join(config.save_dir, f"quantized_{config.model_name}.onnx"),
+ config_save_to=os.path.join(config.save_dir, 'quant_cfg.json'))
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_model.py b/models/cv/classification/wide_resnet50/ixrt/refine_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f1e6c2f6325651556267ceed7e4403a565a2f69
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_model.py
@@ -0,0 +1,290 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import argparse
+import dataclasses
+
+import torch
+import onnx
+
+from refine_utils.matmul_to_gemm_pass import FusedGemmPass
+from refine_utils.linear_pass import FusedLinearPass
+
+from refine_utils.common import *
+
+def get_constant_input_name_of_operator(graph: Graph, operator: Operator):
+ const = None
+ for input in operator.inputs:
+ if not graph.containe_var(input):
+ continue
+
+ if not graph.is_leaf_variable(input):
+ continue
+
+ input_var = graph.get_variable(input)
+ if input_var.value is not None:
+ const = input
+ return const
+
+class FuseLayerNormPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+ find_sequence_subgraph(
+ graph,
+ [OP.REDUCE_MEAN, OP.SUB, OP.POW, OP.REDUCE_MEAN, OP.ADD, OP.SQRT, OP.DIV, OP.MUL, OP.ADD],
+ self.fuse_layer_norm,
+ strict=False
+ )
+ return graph
+
+ def fuse_layer_norm(self, graph: Graph, pattern: PatternGraph):
+        # Check that REDUCE_MEAN and SUB take the same input
+ if pattern.nodes[0].operator.inputs[0] != pattern.nodes[1].operator.inputs[0]:
+ return
+
+        # Check that POW and DIV take the same input
+ if pattern.nodes[2].operator.inputs[0] != pattern.nodes[6].operator.inputs[0]:
+ return
+
+        # Check whether the outputs of the intermediate operators are consumed by more than one operator
+ nodes = pattern.nodes
+ for node in [nodes[0]] + nodes[2:-1]:
+ next_ops = graph.get_next_operators(node.operator)
+ if len(next_ops) > 1:
+ return
+
+ eps = None
+ for input in nodes[4].operator.inputs:
+ input_var = graph.get_variable(input)
+ if input_var.value is not None and graph.is_leaf_variable(input):
+ eps = to_py_type(input_var.value)
+
+ scale = get_constant_input_name_of_operator(graph, nodes[-2].operator)
+ bias = get_constant_input_name_of_operator(graph, nodes[-1].operator)
+
+ self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator)
+
+ bias_var = graph.get_variable(bias)
+ print(bias_var)
+
+ attributes = {
+ "axis": nodes[0].operator.attributes.axes,
+ "epsilon": eps,
+ }
+
+
+ layer_norm_op = self.transform.make_operator(
+ op_type="LayerNormalization",
+ inputs=[nodes[0].operator.inputs[0], scale, bias],
+ outputs=[nodes[-1].operator.outputs[0]],
+ **attributes
+ )
+
+ self.transform.add_operator(layer_norm_op)
+
+class FusedGeluPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.DIV, OP.ERF, OP.ADD, OP.MUL, OP.MUL], callback=self.fuse_gelu, strict=True
+ )
+ return graph
+
+ def fuse_gelu(self, graph: Graph, pattern: PatternGraph):
+ nodes = pattern.nodes
+ prev_op = self.transform.get_previous_operators(nodes[0].operator)[0]
+ next_ops = self.transform.get_next_operators(prev_op)
+ if len(next_ops) != 2:
+ return
+
+ if nodes[0].operator not in next_ops or nodes[3].operator not in next_ops:
+ return
+
+ gelu_op_input = None
+ for input in nodes[3].operator.inputs:
+ if input in nodes[0].operator.inputs:
+ gelu_op_input = input
+ break
+
+ self.transform.delete_operators_between_op_op(nodes[0].operator, nodes[-1].operator)
+
+ gelu_op = self.transform.make_operator(
+ op_type=OP.GELU,
+ inputs=[gelu_op_input],
+ outputs=[nodes[-1].operator.outputs[0]]
+ )
+ self.transform.add_operator(gelu_op)
+
+@dataclasses.dataclass
+class NormalizeAttr(BaseOperatorAttr):
+ p: float = 2.0
+ epsilon: float = 1e-12
+ axis: int = 1
+
+
+@registe_operator(OP.GELU)
+class GeluOperator(BaseOperator):
+
+ def call(
+ self,
+ executor,
+ operator: Operator,
+ inputs: List,
+ attr: NormalizeAttr,
+ ):
+ return F.gelu(inputs[0])
+
+ def convert_onnx_operator(
+ self, ir_graph: Graph, onnx_graph: onnx.GraphProto, node: onnx.NodeProto
+ ) -> Operator:
+ return default_converter(ir_graph, onnx_graph, node, attr_cls=attr.EmptyAttr)
+
+ def quantize(
+ self,
+ graph: Graph,
+ op: Operator,
+ operator_observer_config: QuantOperatorObserverConfig,
+ quant_outputs: bool = False,
+ ):
+ return quant_single_input_operator(graph, op, operator_observer_config, quant_outputs=quant_outputs)
+
+
+
+class ClearUnsedVariables(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ vars = list(graph.variables)
+
+ for var in vars:
+ if len(graph.get_dst_operators(var)) == 0 and graph.is_leaf_variable(var):
+ graph.delete_variable(var)
+
+ quant_params = list(graph.quant_parameters.keys())
+ for var in quant_params:
+ if not graph.containe_var(var):
+ graph.quant_parameters.pop(var)
+
+ return graph
+
+class FormatLayerNorm(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ for op in graph.operators.values():
+ if "LayerNorm" in op.op_type:
+ self.format_layer_norm(graph, op)
+ return graph
+
+ def format_layer_norm(self, graph, operator):
+ if not hasattr(operator.attributes, "axis"):
+ return
+ if isinstance(operator.attributes.axis, (tuple, list)):
+ operator.attributes.axis = operator.attributes.axis[0]
+
+class FormatReshape(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ for op in graph.operators.values():
+ if op.op_type == "Reshape":
+ self.format_reshape(graph, op)
+
+ return graph
+
+ def format_reshape(self, graph, operator):
+ shape = graph.get_variable(operator.inputs[1])
+ shape.value = torch.tensor(shape.value, dtype=torch.int64)
+
+class FormatScalar(BasePass):
+
+ def process(self, graph: Graph):
+ for var in graph.variables.values():
+ var: Variable
+ use_ops = graph.get_dst_operators(var)
+
+ if len(use_ops) == 0:
+ continue
+
+ if use_ops[0].op_type not in [OP.MUL, OP.ADD, OP.GATHER]:
+ continue
+
+ if var.value is not None and var.value.ndim == 0:
+ var.value = var.value.reshape(1)
+ print(f"Reshape scalar to tensor for {var.name}.")
+
+ return graph
+
+class RenamePass(BasePass):
+
+ def process(self, graph:Graph):
+
+ names = [name for name in graph.operators.keys()]
+ for old_name in names:
+ new_name = old_name.replace("/", "#")
+
+ graph.rename_operator(old_name, new_name)
+
+ names = [name for name in graph.variables.keys()]
+ for name in names:
+ new_name = name.replace("/", ".").replace("Output", "out").replace("output", "out")
+
+ graph.rename_vaiable(name, new_name,
+ with_variables=True,
+ with_operator_outputs=True)
+
+ return graph
+
+def create_pipeline(example_inputs):
+ return PassSequence(
+ # FuseLayerNormPass(),
+ FusedGeluPass(),
+
+ # ClearUnsedVariables(),
+ # FormatLayerNorm(),
+ # FormatReshape(),
+ # FormatScalar(),
+ # RenamePass()
+ )
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--onnx_path", type=str)
+ parser.add_argument("--dst_onnx_path", type=str)
+
+ parser.add_argument("--bsz", type=int, default=8,
+ help="Batch size")
+ parser.add_argument("--imgsz", type=int, default=224,
+ help="Image size")
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+
+ example_inputs = torch.randn(args.bsz, 3, args.imgsz, args.imgsz)
+
+    refine_pipeline = Pipeline(
+ create_source(f"{args.onnx_path}", example_inputs=example_inputs),
+ create_pipeline(example_inputs),
+ create_target(
+ f"{args.dst_onnx_path}",
+ example_inputs=example_inputs,
+ )
+ )
+    refine_pipeline.run()
+
+    print(f"Refined the model, input shape={example_inputs.shape}")
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/__init__.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2af19a14df73cea6ba27ad6a8ad020fe0bec7aaa
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/common.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+
+def find_sequence_subgraph(graph,
+ pattern: Union[List[str], PatternGraph],
+ callback: Callable[[Graph, PatternGraph], None],
+ strict=True):
+ if isinstance(pattern, List):
+ pattern = build_sequence_graph(pattern)
+
+ matcher = GraphMatcher(pattern, strict=strict)
+ return matcher.findall(graph, callback)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..29b5e4a96e6edc448168bd78ede3111f6b59c032
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/linear_pass.py
@@ -0,0 +1,113 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import dataclasses
+
+from refine_utils.common import *
+
+# A x B = C, only when B is an initializer
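+# This pass rewrites MatMul (optionally followed by Add) into a fused "LinearFP16" op:
+# the initializer B is transposed to [linear_dim, hidden_size], and when an Add follows,
+# its bias tensor is appended as the third input of the fused operator.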
+
+class FusedLinearPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL, OP.ADD], callback=self.to_linear_with_bias, strict=True
+ )
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL], callback=self.to_linear, strict=True
+ )
+ return graph
+
+ def to_linear_with_bias(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ add = pattern.nodes[1]
+ if len(add.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+ bias_var = None
+ for input in add.operator.inputs:
+ if input not in matmul.operator.outputs:
+ bias_var = input
+
+ inputs = matmul.operator.inputs
+ inputs.append(bias_var)
+ outputs = add.operator.outputs
+
+ b_var.value = b_var.value.transpose(1, 0)
+ b_var.shape[0],b_var.shape[1] = b_var.shape[1],b_var.shape[0]
+
+ hidden_size = b_var.shape[1]
+ linear_dim = b_var.shape[0]
+
+ attributes = {
+ "hidden_size": hidden_size,
+ "linear_dim": linear_dim,
+ "has_bias": 1,
+ "act_type":"none"
+ }
+
+ self.transform.make_operator(
+ "LinearFP16",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+ self.transform.delete_operator(add.operator)
+ self.transform.delete_operator(matmul.operator)
+
+ def to_linear(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ if len(matmul.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+        b_var.value = b_var.value.transpose(1, 0)
+        b_var.shape[0], b_var.shape[1] = b_var.shape[1], b_var.shape[0]
+
+        hidden_size = b_var.shape[1]
+        linear_dim = b_var.shape[0]
+
+        attributes = {
+            "hidden_size": hidden_size,
+            "linear_dim": linear_dim,
+            "has_bias": 0,
+            "act_type": "none"
+        }
+
+ op = self.transform.make_operator(
+ op_type = "LinearFP16",
+ inputs = pattern.nodes[0].operator.inputs,
+ outputs=[pattern.nodes[-1].operator.outputs[0]],
+ **attributes
+ )
+
+ self.transform.add_operator(op)
+
+ self.transform.delete_operator(matmul.operator)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py b/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ebfac4d917d6b05e46187f025c3c17184096e80
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/refine_utils/matmul_to_gemm_pass.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from refine_utils.common import *
+
+#
+# Common pattern Matmul to Gemm
+#
+class FusedGemmPass(BasePass):
+
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ find_sequence_subgraph(
+ graph, pattern=[OP.MATMUL], callback=self.to_gemm, strict=True
+ )
+ return graph
+
+ def to_gemm(self, graph, pattern: PatternGraph):
+ matmul_op = pattern.nodes[0]
+ inputs = matmul_op.operator.inputs
+ outputs = matmul_op.operator.outputs
+
+        if len(inputs) != 2 or len(outputs) != 1:
+ return
+
+ for input in inputs:
+ if self.transform.is_leaf_variable(input):
+ return
+
+ print(f"{self.transform.get_variable(inputs[0]).shape} {self.transform.get_variable(inputs[1]).shape}")
+ self.transform.delete_operator(matmul_op.operator)
+
+ op = self.transform.make_operator(
+ op_type = "Gemm",
+ inputs = inputs,
+ outputs = outputs,
+ alpha = 1,
+ beta = 1,
+ transB = 1
+ )
+
+ self.transform.add_operator(op)
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b743d7084ae058118c29daaf494769fc293ceb41
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_accuracy.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
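+# Example (illustrative): the caller is expected to export CONFIG_DIR, CHECKPOINTS_DIR,
+# DATASETS_DIR and RUN_DIR, and may override the defaults above, e.g. "--bs 32 --tgt -1".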
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+fi
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e7a4f1a7276406a0ed7400af4368b5bec2a06e06
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_fp16_performance.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Change Batchsize
+let step++
+echo;
+echo [STEP ${step}] : Change Batchsize
+FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_${BSZ}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+fi
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..367bdd4bd22be28f96cd3c6719888d0ca889c612
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_accuracy.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -x
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=int8
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+ echo [STEP ${step}] : Simplify Model
+ if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+ else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+ fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --model_name ${MODEL_NAME} \
+ --dataset_dir ${DATASETS_DIR} \
+ --observer ${QUANT_OBSERVER} \
+ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+ --save_dir $CHECKPOINTS_DIR \
+ --bsz ${QUANT_BATCHSIZE} \
+ --step ${QUANT_STEP} \
+ --seed ${QUANT_SEED} \
+ --imgsz ${IMGSIZE}
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+ # Change Batchsize
+ let step++
+ echo;
+ echo [STEP ${step}] : Change Batchsize
+ FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx
+ if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+ else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+ fi
+
+ # Build Engine
+ let step++
+ echo;
+ echo [STEP ${step}] : Build Engine
+ ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+ else
+ python3 ${RUN_DIR}/build_i8_engine.py \
+ --onnx ${FINAL_MODEL} \
+ --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+ fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
diff --git a/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..468c557de451ddab0024ef2c69e9fa42751a50ce
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/scripts/infer_wide_resnet50_int8_performance.sh
@@ -0,0 +1,143 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=32
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=int8
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_sim.onnx
+
+# Simplify Model
+let step++
+ echo [STEP ${step}] : Simplify Model
+ if [ -f ${SIM_MODEL} ];then
+    echo "  "Simplify Model Skip, ${SIM_MODEL} already exists
+ else
+ python3 ${RUN_DIR}/simplify_model.py \
+ --origin_model $ORIGINE_MODEL \
+ --output_model ${SIM_MODEL}
+ echo " "Generate ${SIM_MODEL}
+ fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/quantized_${MODEL_NAME}.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo "  "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --model_name ${MODEL_NAME} \
+ --dataset_dir ${DATASETS_DIR} \
+ --observer ${QUANT_OBSERVER} \
+ --disable_quant_names ${DISABLE_QUANT_LIST[@]} \
+ --save_dir $CHECKPOINTS_DIR \
+ --bsz ${QUANT_BATCHSIZE} \
+ --step ${QUANT_STEP} \
+ --seed ${QUANT_SEED} \
+ --imgsz ${IMGSIZE}
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+ # Change Batchsize
+ let step++
+ echo;
+ echo [STEP ${step}] : Change Batchsize
+ FINAL_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}_quant_${BSZ}.onnx
+ if [ -f $FINAL_MODEL ];then
+    echo "  "Change Batchsize Skip, $FINAL_MODEL already exists
+ else
+ python3 ${RUN_DIR}/modify_batchsize.py --batch_size ${BSZ} \
+ --origin_model ${SIM_MODEL} --output_model ${FINAL_MODEL}
+ echo " "Generate ${FINAL_MODEL}
+ fi
+
+ # Build Engine
+ let step++
+ echo;
+ echo [STEP ${step}] : Build Engine
+ ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+ if [ -f $ENGINE_FILE ];then
+    echo "  "Build Engine Skip, $ENGINE_FILE already exists
+ else
+ python3 ${RUN_DIR}/build_i8_engine.py \
+ --onnx ${FINAL_MODEL} \
+ --qparam_json ${CHECKPOINTS_DIR}/quant_cfg.json \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+ fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --acc_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/classification/wide_resnet50/ixrt/simplify_model.py b/models/cv/classification/wide_resnet50/ixrt/simplify_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9948a9fa083ff99ff88e556e96614b02cccaa965
--- /dev/null
+++ b/models/cv/classification/wide_resnet50/ixrt/simplify_model.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+from onnxsim import simplify
+
+# Simplify
+def simplify_model(args):
+ onnx_model = onnx.load(args.origin_model)
+ model_simp, check = simplify(onnx_model)
+ model_simp = onnx.shape_inference.infer_shapes(model_simp)
+ onnx.save(model_simp, args.output_model)
+ print(" Simplify onnx Done.")
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--origin_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ parser.add_argument("--reshape", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+simplify_model(args)
+
+
+
+
diff --git a/models/cv/detection/yolov4/ixrt/README.md b/models/cv/detection/yolov4/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..886a87aec3bc59e730e1b9fb3436fe07c8179600
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/README.md
@@ -0,0 +1,82 @@
+# YOLOv4
+
+## Description
+
+YOLOv4 employs a two-step process: regression for bounding-box localization and classification for object categorization. It combines contributions from earlier YOLO-family research with newer techniques such as WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, DropBlock regularization, and CIoU loss.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained cfg:
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+### Model Conversion
+
+```bash
+# clone yolov4
+git clone https://github.com/Tianxiaomo/pytorch-YOLOv4.git yolov4
+
+# download weight
+mkdir data
+wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.weights -P data
+
+# export onnx model
+python3 export.py --cfg yolov4/cfg/yolov4.cfg --weight data/yolov4.weights --batchsize 16 --output data/yolov4.onnx
+mv yolov4_16_3_608_608_static.onnx data/yolov4.onnx
+
+# Use onnxsim to optimize the onnx model
+onnxsim data/yolov4.onnx data/yolov4_sim.onnx
+
+# Make sure the dataset path is "data/coco"
+```
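+
+As an optional sanity check (a minimal sketch assuming the paths produced by the commands above), you can verify that the simplified ONNX is valid and carries the expected static input shape before building the engine:
+
+```python
+import onnx
+
+model = onnx.load("data/yolov4_sim.onnx")
+onnx.checker.check_model(model)
+
+# Print each graph input and its shape; with the export settings above this should be
+# a single input of shape [16, 3, 608, 608].
+for inp in model.graph.input:
+    dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
+    print(inp.name, dims)
+```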
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov4darknet_fp16_accuary.sh
+# Performance
+bash scripts/infer_yolov4darknet_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov4darknet_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov4darknet_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | ------ | ------- |
+| YOLOv4 | 32 | FP16 | 303.27 | 0.730 |
+| YOLOv4 | 32 | INT8 | 682.14 | 0.608 |
+
+## Reference
+
+DarkNet:
+Pytorch-YOLOv4:
diff --git a/models/cv/detection/yolov4/ixrt/build_engine.py b/models/cv/detection/yolov4/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4080edd3c275a4595cbfb407a21cebdada7fa7
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/build_engine.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
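+    # The optimization profile above covers batch sizes from 1 (min) to 64 (max), with
+    # 32 as the optimum, at a fixed 608x608 input resolution.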
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov4/ixrt/coco_labels.py b/models/cv/detection/yolov4/ixrt/coco_labels.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fc21282c7fa393e9d15e8bdc16c741dc7e78448
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/coco_labels.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+labels = [
+ "person",
+ "bicycle",
+ "car",
+ "motorcycle",
+ "airplane",
+ "bus",
+ "train",
+ "truck",
+ "boat",
+ "traffic light",
+ "fire hydrant",
+ "stop sign",
+ "parking meter",
+ "bench",
+ "bird",
+ "cat",
+ "dog",
+ "horse",
+ "sheep",
+ "cow",
+ "elephant",
+ "bear",
+ "zebra",
+ "giraffe",
+ "backpack",
+ "umbrella",
+ "handbag",
+ "tie",
+ "suitcase",
+ "frisbee",
+ "skis",
+ "snowboard",
+ "sports ball",
+ "kite",
+ "baseball bat",
+ "baseball glove",
+ "skateboard",
+ "surfboard",
+ "tennis racket",
+ "bottle",
+ "wine glass",
+ "cup",
+ "fork",
+ "knife",
+ "spoon",
+ "bowl",
+ "banana",
+ "apple",
+ "sandwich",
+ "orange",
+ "broccoli",
+ "carrot",
+ "hot dog",
+ "pizza",
+ "donut",
+ "cake",
+ "chair",
+ "couch",
+ "potted plant",
+ "bed",
+ "dining table",
+ "toilet",
+ "tv",
+ "laptop",
+ "mouse",
+ "remote",
+ "keyboard",
+ "cell phone",
+ "microwave",
+ "oven",
+ "toaster",
+ "sink",
+ "refrigerator",
+ "book",
+ "clock",
+ "vase",
+ "scissors",
+ "teddy bear",
+ "hair drier",
+ "toothbrush",
+]
+def coco80_to_coco91_class(): # converts 80-index (val2014) to 91-index (paper)
+ return [
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90]
+
+__all__ = ["labels"]
diff --git a/models/cv/detection/yolov4/ixrt/common.py b/models/cv/detection/yolov4/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
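+# e.g. one row [10, 20, 4, 6, 0.9, 0.2, 0.7] (cx, cy, w, h, conf, prob0, prob1)
+#      maps to [8.0, 17.0, 12.0, 23.0, 2.0, 0.63] (x1, y1, x2, y2, class_id starting at 1, prob1*conf)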
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+ If image path is given, return it directly
+ For txt file, read it and return each line as image path
+ In other case, it's a folder, return a list with names of each
+ jpg, jpeg and png file
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
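+    # Note: images collected after the last flush stay in the temp_* lists and are never
+    # appended, so the trailing (possibly partial) batch is dropped.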
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+ # [batch, num, 1, 4]
+ box_array = output[:, :, :4]
+ # [batch, num, 2]
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov4/ixrt/cut_model.py b/models/cv/detection/yolov4/ixrt/cut_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf4f88dae926b8d15356c7f6b48d89fe80dc9f2a
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/cut_model.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import onnx
+import argparse
+from onnxsim import simplify
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_model", type=str)
+ parser.add_argument("--output_model", type=str)
+ parser.add_argument("--input_names", nargs='+', type=str)
+ parser.add_argument("--output_names", nargs='+', type=str)
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+onnx.utils.extract_model(args.input_model, args.output_model, args.input_names, args.output_names)
+print(" Cut Model Done.")
diff --git a/models/cv/detection/yolov4/ixrt/deploy.py b/models/cv/detection/yolov4/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..084356ec8cb14a0604bf994faca4ce15834e4b15
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/deploy.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseMishPass(BasePass):
+ def process(self, graph: Graph) -> Graph:
+ pattern = build_sequence_graph([OP.SOFTPLUS, OP.TANH, OP.MUL])
+
+ matcher = GraphMatcher(pattern, strict=False)
+ self.transform = GraphTransform(graph)
+ matcher.findall(graph, self.fuse_mish)
+ return graph
+
+ def fuse_mish(self, graph: Graph, pattern_graph: PatternGraph):
+ softplus = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+ if not self.can_fused(graph, pattern_graph):
+ return
+
+ self.transform.delete_operators_between_op_op(softplus, mul)
+
+ mish_op = Operator(
+ name=generate_operator_name(graph, pattern="Mish_{idx}"),
+ op_type=OP.MISH,
+ inputs=copy.copy(softplus.inputs),
+ outputs=copy.copy(mul.outputs),
+ )
+ mish_op.is_quant_operator = softplus.is_quant_operator and mul.is_quant_operator
+ graph.add_operator(mish_op)
+
+ def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
+ softplus = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+        # Check that the outputs of Softplus and Tanh are each consumed by only one op;
+        # if they feed multiple ops, the pattern cannot be fused
+ for node in pattern_graph.nodes[:2]:
+ next_ops = graph.get_next_operators(node.operator)
+ if len(next_ops) != 1:
+ return False
+
+        # Check that Mul and Softplus take their input from the same source
+ softplus_prev_op = graph.get_previous_operators(softplus)
+ if len(softplus_prev_op) != 1:
+ return False
+
+ mul_prev_op = graph.get_previous_operators(mul)
+ if len(mul_prev_op) != 2:
+ return False
+
+ for op in mul_prev_op:
+ if op is softplus_prev_op[0]:
+ return True
+
+ return False
+
+
+class Transform:
+ def __init__(self, graph):
+ self.t = GraphTransform(graph)
+ self.graph = graph
+
+ def ReplaceFocus(self, input_edge, outputs, to_op):
+ input_var = self.graph.get_variable(input_edge)
+ op = self.graph.get_operator(to_op)
+ self.t.delete_operators_between_var_op(
+ from_var=input_var, to_op=op
+ )
+ self.t.make_operator(
+ "Focus", inputs=input_edge, outputs=outputs
+ )
+ return self.graph
+
+ def AddYoloDecoderOp(self, inputs: list, outputs: list, op_type, **attributes):
+ if attributes["anchor"] is None:
+ del attributes["anchor"]
+ self.t.make_operator(
+ op_type, inputs=inputs, outputs=outputs, **attributes
+ )
+ return self.graph
+
+ def AddConcatOp(self, inputs: list, outputs, **attributes):
+ self.t.make_operator(
+ "Concat", inputs=inputs, outputs=outputs, **attributes
+ )
+ return self.graph
+
+def customize_ops(graph, args):
+ t = Transform(graph)
+ fuse_focus = args.focus_input is not None and args.focus_output is not None and args.focus_last_node is not None
+ if fuse_focus:
+ graph = t.ReplaceFocus(
+ input_edge=args.focus_input,
+ outputs=args.focus_output,
+ to_op=args.focus_last_node
+ )
+ decoder_input = args.decoder_input_names
+ num = len(decoder_input) // 3
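+    # Attach one YOLO decoder per detection head (strides 8/16/32, plus an optional
+    # stride-64 head), then concatenate all decoded boxes into a single "output" tensor.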
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[:num],
+ outputs=["decoder_8"],
+ op_type=args.decoder_type,
+ anchor=args.decoder8_anchor,
+ num_class=args.num_class,
+ stride=8,
+ faster_impl=args.faster
+ )
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num:num*2],
+ outputs=["decoder_16"],
+ op_type=args.decoder_type,
+ anchor=args.decoder16_anchor,
+ num_class=args.num_class,
+ stride=16,
+ faster_impl=args.faster
+ )
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num*2:num*2+1],
+ outputs=["decoder_32"],
+ op_type=args.decoder_type,
+ anchor=args.decoder32_anchor,
+ num_class=args.num_class,
+ stride=32,
+ faster_impl=args.faster
+ )
+ if args.decoder64_anchor is not None:
+ graph = t.AddYoloDecoderOp(
+ inputs=decoder_input[num*2+1:],
+ outputs=["decoder_64"],
+ op_type=args.decoder_type,
+ anchor=args.decoder64_anchor,
+ num_class=args.num_class,
+ stride=64,
+ faster_impl=args.faster
+ )
+ graph = t.AddConcatOp(
+ inputs=["decoder_8", "decoder_16", "decoder_32", "decoder_64"],
+ outputs=["output"],
+ axis=1
+ )
+ else:
+ graph = t.AddConcatOp(
+ inputs=["decoder_32", "decoder_16", "decoder_8"],
+ outputs=["output"],
+ axis=1
+ )
+
+ graph.outputs.clear()
+ graph.add_output("output")
+ graph.outputs["output"].dtype = "FLOAT"
+ return graph
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--src", type=str)
+ parser.add_argument("--dst", type=str)
+ parser.add_argument("--decoder_type", type=str, choices=["YoloV3Decoder", "YoloV5Decoder", "YoloV7Decoder", "YoloxDecoder"])
+ parser.add_argument("--decoder_input_names", nargs='+', type=str)
+ parser.add_argument("--decoder8_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder16_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder32_anchor", nargs='*', type=int)
+ parser.add_argument("--decoder64_anchor", nargs='*', type=int, default=None)
+ parser.add_argument("--num_class", type=int, default=80)
+ parser.add_argument("--faster", type=int, default=1)
+ parser.add_argument("--focus_input", type=str, default=None)
+ parser.add_argument("--focus_output", type=str, default=None)
+ parser.add_argument("--focus_last_node", type=str, default=None)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+
+ args = parse_args()
+ graph = create_source(args.src)()
+ graph = customize_ops(graph, args)
+ graph = FuseMishPass().process(graph)
+ create_target(saved_path=args.dst).export(graph)
+    print("Modified ONNX saved to", args.dst)
diff --git a/models/cv/detection/yolov4/ixrt/export.py b/models/cv/detection/yolov4/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c8bbfa5aa79f1a982c340690658325d23fa4b54
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/export.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+sys.path.insert(0, "yolov4")
+import argparse
+
+from yolov4.tool.darknet2onnx import *
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--cfg",
+ type=str,
+ required=True,
+ help="darknet cfg path.")
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="darknet weights path.")
+
+ parser.add_argument("--batchsize",
+ type=int,
+ required=True,
+ help="Onnx model batchsize.")
+
+ parser.add_argument("--output",
+ type=str,
+ required=True,
+ help="export onnx model path.")
+
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ transform_to_onnx(args.cfg, args.weight, args.batchsize, args.output)
+
+if __name__ == "__main__":
+ main()
+
diff --git a/models/cv/detection/yolov4/ixrt/inference.py b/models/cv/detection/yolov4/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d740507b3a54bf2248000b2ac60d09f12a9886a
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/inference.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import glob
+import json
+import os
+import time
+import sys
+from tqdm import tqdm
+
+import torch
+import numpy as np
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from coco_labels import coco80_to_coco91_class
+from common import save2json, box_class85to6
+from common import load_images, prepare_batch
+from common import create_engine_context, setup_io_bindings
+from common import scale_boxes, post_processing
+
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+
+
+def main(config):
+
+ # Step1: Load dataloader
+ images_path = load_images(config.eval_dir)
+ dataloader = prepare_batch(images_path, config.bsz)
+
+ # Step2: Load Engine
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(config.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((config.bsz,3,config.imgsz,config.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ json_result = []
+ forward_time = 0.0
+ class_map = coco80_to_coco91_class()
+ num_samples = 0
+ # Step3: Run on coco dataset
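+    # In MAP mode the predictions are collected in COCO JSON format and scored with
+    # pycocotools; in FPS mode only the forward passes are timed.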
+ for batch_names, batch_images, batch_shapes in tqdm(zip(*dataloader)):
+ batch_data = np.ascontiguousarray(batch_images)
+ data_shape = batch_data.shape
+ h, w = zip(*batch_shapes)
+ batch_img_shape = [h, w]
+ batch_img_id = [int(x.split('.')[0]) for x in batch_names]
+
+ cur_bsz_sample = batch_images.shape[0]
+ num_samples += cur_bsz_sample
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ # print(f"output shape : {output.shape} output type : {output.dtype}")
+
+ # Forward
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ if config.test_mode == "MAP":
+ # Fetch output
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+ pred_boxes = post_processing(None, 0.001, 0.6, output)
+
+ pred_results = []
+ # Calculate pred box on raw shape
+ for (pred_box, raw_shape) in zip(pred_boxes, batch_shapes):
+ h, w = raw_shape
+                if len(pred_box) == 0:  # no detection results
+                    continue
+ pred_box = np.array(pred_box, dtype=np.float32)
+ pred_box = scale_boxes((config.imgsz, config.imgsz), pred_box, raw_shape, use_letterbox=False)
+
+ pred_results.append(pred_box.tolist())
+
+ save2json(batch_img_id, pred_results, json_result, class_map)
+
+ fps = num_samples / forward_time
+
+ if config.test_mode == "FPS":
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ if config.test_mode == "MAP":
+ if len(json_result) == 0:
+ print("Predict zero box!")
+ exit(1)
+
+ if not os.path.exists(config.pred_dir):
+ os.makedirs(config.pred_dir)
+
+ pred_json = os.path.join(
+ config.pred_dir, f"{config.model_name}_{config.precision}_preds.json"
+ )
+ with open(pred_json, "w") as f:
+ json.dump(json_result, f)
+
+ anno_json = config.coco_gt
+ anno = COCO(anno_json) # init annotations api
+ pred = anno.loadRes(pred_json) # init predictions api
+ eval = COCOeval(anno, pred, "bbox")
+
+ eval.evaluate()
+ eval.accumulate()
+ print(
+ f"==============================eval {config.model_name} {config.precision} coco map =============================="
+ )
+ eval.summarize()
+
+ map, map50 = eval.stats[:2]
+ print("MAP@0.5 : ", map50)
+ print(f"Accuracy Check : Test {map50} >= target {config.map_target}")
+ if map50 >= config.map_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model_name", type=str, default="YOLOV4", help="YOLOV3 YOLOV4 YOLOV5 YOLOV7 YOLOX"
+ )
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--test_mode", type=str, default="FPS", help="FPS MAP")
+ parser.add_argument(
+ "--model_engine",
+ type=str,
+ default="",
+ help="model engine path",
+ )
+ parser.add_argument(
+ "--coco_gt",
+ type=str,
+ default="data/datasets/cv/coco2017/annotations/instances_val2017.json",
+ help="coco instances_val2017.json",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--loop_count", type=int, default=-1, help="loop count")
+ parser.add_argument(
+ "--eval_dir",
+ type=str,
+ default="data/datasets/cv/coco2017/val2017",
+ help="coco image dir",
+ )
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=608,
+ help="inference size h,w",
+ )
+ parser.add_argument("--pred_dir", type=str, default=".", help="pred save json dirs")
+ parser.add_argument("--map_target", type=float, default=0.56, help="target mAP")
+ parser.add_argument("--fps_target", type=float, default=-1.0, help="target fps")
+
+ config = parser.parse_args()
+ print("config:", config)
+ return config
+
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
diff --git a/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py b/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bb0abc21bd5806c51d6b908e3e3407cfdb62cc8
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import ctypes
+import tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+ if not exists(dynamic_path):
+ raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
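+    # Load the plugin shared library and register its custom ops with the TensorRT
+    # plugin registry so engines that rely on them can be built and deserialized.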
+ ctypes.CDLL(dynamic_path)
+ tensorrt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
diff --git a/models/cv/detection/yolov4/ixrt/quant.py b/models/cv/detection/yolov4/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
+ num = step * batch_size
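+    # Build a calibration set of step * batch_size images randomly sampled from the
+    # validation directory for post-training quantization.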
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b732d4eb297b6319ad5bef4660a6f7dde0ef0abc
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_accuary.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode MAP \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision float16 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..796dad720e13250b6ee81c66defca990c416e220
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_fp16_performance.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_fp16.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode FPS \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision float16 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c62d174c09e6f4b005a9b1e7ce028cc47643a930
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_accuary.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV4_DARKNET" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 608
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode MAP \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision int8 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e335fa1d013961c136cda4f79fd2be712311494
--- /dev/null
+++ b/models/cv/detection/yolov4/ixrt/scripts/infer_yolov4darknet_int8_performance.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov4_darknet
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=16
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov4_sim.onnx
+
+# Cut decoder part
+echo "Cut decoder part"
+FINAL_MODEL=${CHECKPOINTS_DIR}/yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "CUT Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/cut_model.py \
+ --input_model ${CURRENT_MODEL} \
+ --output_model ${FINAL_MODEL} \
+ --input_names input \
+ --output_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_without_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Model Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV4_DARKNET" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 608
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# add decoder op
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov4_bs${BATCH_SIZE}_with_decoder.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Add Decoder Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/deploy.py \
+ --src ${CURRENT_MODEL} \
+ --dst ${FINAL_MODEL} \
+ --decoder_type YoloV3Decoder \
+ --decoder_input_names /models.138/conv94/Conv_output_0 /models.149/conv102/Conv_output_0 /models.160/conv110/Conv_output_0 \
+ --decoder8_anchor 12 16 19 36 40 28 \
+ --decoder16_anchor 36 75 76 55 72 146 \
+ --decoder32_anchor 142 110 192 243 459 401
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov4_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=16
+python3 ${RUN_DIR}/inference.py \
+ --test_mode FPS \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 608 \
+ --loop_count 10 \
+ --eval_dir ${EVAL_DIR} \
+ --coco_gt ${COCO_GT} \
+ --pred_dir ${CHECKPOINTS_DIR} \
+ --precision int8 \
+ --map_target 0.30; check_status
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/README.md b/models/cv/detection/yolov6/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..66258563113c66ba4aa22c98cd8b00ef056900a5
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/README.md
@@ -0,0 +1,84 @@
+# YOLOv6
+
+## Description
+
+YOLOv6 integrates cutting-edge object detection advancements from industry and academia, incorporating recent innovations in network design, training strategies, testing techniques, quantization, and optimization methods. This culmination results in a suite of deployment-ready networks, accommodating varied use cases across different scales.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+```bash
+# get yolov6s.pt
+wget https://github.com/meituan/YOLOv6/releases/download/0.4.0/yolov6s.pt
+# set coco path
+mkdir -p data/
+ln -s /Path/to/coco/ data/coco
+```
+
+### Model Conversion
+
+```bash
+# install yolov6
+git clone https://github.com/meituan/YOLOv6.git
+
+pushd YOLOv6
+pip3 install -r requirements.txt
+
+# export onnx model
+python3 deploy/ONNX/export_onnx.py --weights ../yolov6s.pt --img 640 --batch-size 32 --simplify
+mv ../yolov6s.onnx ../data/
+
+popd
+```
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov6s_fp16_accuracy.sh
+# Performance
+bash scripts/infer_yolov6s_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov6s_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov6s_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | -------- | ------- |
+| YOLOv6 | 32 | FP16 | 1107.511 | - |
+| YOLOv6 | 32 | INT8 | 2080.475 | - |
+
+## Reference
+
+YOLOv6:
diff --git a/models/cv/detection/yolov6/ixrt/build_engine.py b/models/cv/detection/yolov6/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e1719a22c84b400a2ba9b9cbfdea6bae99e80d
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/build_engine.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
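+    # Register min / opt / max shapes for the dynamic "input" binding
+    # (batch 1 to 64, optimum at 32, fixed 3x608x608 images).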
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov6/ixrt/common.py b/models/cv/detection/yolov6/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
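+    # For every engine binding, allocate a device buffer sized from the current
+    # context shape; callers copy inputs into / outputs out of these buffers with pycuda.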
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+    If a single image path is given, return it directly.
+    For a txt file, read it and return each line as an image path.
+    Otherwise treat it as a folder and return the paths of all
+    jpg, jpeg and png files inside it.
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
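+    # Accumulate preprocessed images until a full batch of bs images is collected, then
+    # flush names, stacked images and original shapes together (images remaining after
+    # the last flush are not emitted).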
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
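+    # Greedy NMS: repeatedly keep the highest-confidence box and drop all boxes whose
+    # IoU (or min-area overlap when min_mode=True) with it exceeds nms_thresh.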
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+    # [batch, num, 4] : x1, y1, x2, y2
+ box_array = output[:, :, :4]
+ # [batch, num, 2]
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov6/ixrt/deploy.py b/models/cv/detection/yolov6/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73d14b2617eee1e458825dc66d38177f482a1b1
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/deploy.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseSiLUPass(BasePass):
+ def process(self, graph: Graph) -> Graph:
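+        # SiLU(x) = x * sigmoid(x); match the Sigmoid -> Mul pair so it can be
+        # replaced by a single fused SiLU operator.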
+ pattern = build_sequence_graph([OP.SIGMOID, OP.MUL])
+
+ matcher = GraphMatcher(pattern, strict=False)
+ self.transform = GraphTransform(graph)
+        matcher.findall(graph, self.fuse_silu)
+ return graph
+
+    def fuse_silu(self, graph: Graph, pattern_graph: PatternGraph):
+ sigmoid = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+ if not self.can_fused(graph, pattern_graph):
+ return
+
+ self.transform.delete_operators_between_op_op(sigmoid, mul)
+
+ silu_op = Operator(
+ name=generate_operator_name(graph, pattern="SiLU_{idx}"),
+ op_type=OP.SILU,
+ inputs=copy.copy(sigmoid.inputs),
+ outputs=copy.copy(mul.outputs),
+ )
+ silu_op.is_quant_operator = sigmoid.is_quant_operator and mul.is_quant_operator
+ graph.add_operator(silu_op)
+
+ def can_fused(self, graph: Graph, pattern_graph: PatternGraph):
+ sigmoid = pattern_graph.nodes[0].operator
+ mul = pattern_graph.nodes[-1].operator
+
+        # If the Sigmoid output is consumed by more than one op, the pair cannot be fused.
+ if len(self.transform.get_next_operators(sigmoid)) > 1:
+ return False
+
+        # Check that the other input of Mul comes from the same source as the Sigmoid input.
+        sigmoid_prev_op = graph.get_previous_operators(sigmoid)
+        if len(sigmoid_prev_op) != 1:
+            return False
+
+        mul_prev_op = graph.get_previous_operators(mul)
+        if len(mul_prev_op) != 2:
+            return False
+
+        for op in mul_prev_op:
+            if op is sigmoid_prev_op[0]:
+ return True
+
+ return False
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--src", type=str)
+ parser.add_argument("--dst", type=str)
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+
+ args = parse_args()
+ graph = create_source(args.src)()
+ graph = FuseSiLUPass().process(graph)
+ create_target(saved_path=args.dst).export(graph)
+    print("Modified ONNX saved to", args.dst)
diff --git a/models/cv/detection/yolov6/ixrt/inference.py b/models/cv/detection/yolov6/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..836f13b2376ded6144ea9bf0da7ed47cd3f5905f
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/inference.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+sys.path.insert(0, "YOLOv6")
+import json
+import argparse
+import time
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import numpy as np
+from tqdm import tqdm
+
+from common import create_engine_context, setup_io_bindings
+
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+
+from yolov6.core.evaler import Evaler
+from yolov6.utils.events import NCOLS
+from yolov6.utils.nms import non_max_suppression
+from yolov6.data.data_load import create_dataloader
+
+
+coco_classes = {
+ 0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange',
+ 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed',
+ 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven',
+ 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
+}
+
+class EvalerIXRT(Evaler):
+ def eval_ixrt(self, args, stride=32):
+ self.stride = stride
+ def init_data(dataloader, task):
+ self.is_coco = self.data.get("is_coco", False)
+ self.ids = self.coco80_to_coco91_class() if self.is_coco else list(range(1000))
+ pad = 0.0
+ dataloader = create_dataloader(
+ self.data[task], self.img_size, self.batch_size, self.stride,
+ check_labels=True, pad=pad, rect=False, data_dict=self.data, task=task)[0]
+ return dataloader
+
+ dataloader = init_data(None,'val')
+ pred_results = []
+
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(args.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((args.bsz,3,args.imgsz,args.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ if args.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(args.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ pbar = tqdm(dataloader, desc="Inferencing model in validation dataset.", ncols=NCOLS)
+
+ forward_time = 0.0
+ num_samples = 0
+ for imgs, targes, paths, shapes in pbar:
+ imgs = imgs.float()
+ pad_batch = len(imgs) != self.batch_size
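+            # The engine expects a fixed batch size: pad the last, smaller batch up to it
+            # and crop the padded predictions off after inference.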
+ if pad_batch:
+ origin_size = len(imgs)
+ imgs = np.resize(imgs, (self.batch_size, *imgs.shape[1:]))
+ imgs /= 255.0
+ # print(imgs.shape)
+ batch_data = np.ascontiguousarray(imgs)
+ data_shape = batch_data.shape
+
+ cur_bsz_sample = batch_data.shape[0]
+ num_samples += cur_bsz_sample
+
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ if not args.perf_only:
+ if pad_batch:
+ output = output[:origin_size]
+
+ outputs = torch.from_numpy(output)
+ outputs = non_max_suppression(outputs, self.conf_thres, self.iou_thres, multi_label=True)
+ pred_results.extend(self.convert_to_coco_format(outputs, imgs, paths, shapes, self.ids))
+ if args.perf_only:
+ fps = num_samples / forward_time
+ return fps
+ else:
+ return dataloader, pred_results
+
+ def eval_ixrt_map(self, pred_results, dataloader, task):
+ '''Evaluate the model.
+ For the 'speed' task, only the model's inference speed is evaluated and the inference time is reported.
+ For the 'val' task, both speed and mAP (computed with pycocotools) are evaluated, and the
+ inference time and mAP values are returned.
+ '''
+ if not self.do_coco_metric and self.do_pr_metric:
+ return self.pr_metric_result
+ print(f'\nEvaluating mAP by pycocotools.')
+ if task != 'speed' and len(pred_results):
+ if 'anno_path' in self.data:
+ anno_json = self.data['anno_path']
+ else:
+ # generated coco format labels in dataset initialization
+ task = 'val' if task == 'train' else task
+ dataset_root = os.path.dirname(os.path.dirname(self.data[task]))
+ base_name = os.path.basename(self.data[task])
+ anno_json = os.path.join(dataset_root, 'annotations', f'instances_{base_name}.json')
+ pred_json = os.path.join(self.save_dir, "predictions.json")
+ print(f'Saving {pred_json}...')
+ with open(pred_json, 'w') as f:
+ json.dump(pred_results, f)
+
+ anno = COCO(anno_json)
+ pred = anno.loadRes(pred_json)
+ cocoEval = COCOeval(anno, pred, 'bbox')
+ if self.is_coco:
+ imgIds = [int(os.path.basename(x).split(".")[0])
+ for x in dataloader.dataset.img_paths]
+ cocoEval.params.imgIds = imgIds
+ cocoEval.evaluate()
+ cocoEval.accumulate()
+ cocoEval.summarize()
+
+ return cocoEval.stats
+ else:
+ print("pred_results is empty")
+ return None
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--model_engine",
+ type=str,
+ default="",
+ help="model engine path",
+ )
+
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=608,
+ help="inference size h,w",
+ )
+
+ parser.add_argument("--datasets",
+ type=str,
+ required=True,
+ help="datasets path.")
+
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+
+ parser.add_argument("--acc_target",
+ type=float,
+ default=None,
+ help="Model inference Accuracy target.")
+
+ parser.add_argument("--fps_target",
+ type=float,
+ default=None,
+ help="Model inference FPS target.")
+
+ parser.add_argument("--perf_only",
+ type=bool,
+ default=False,
+ help="Run performance test only")
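+ # Note: with type=bool, argparse treats any non-empty string (including "false") as True; the scripts pass "--perf_only true".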
+
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ task = 'val'
+
+ batch_size = args.bsz
+ data_path = os.path.join(args.datasets, "images", "val2017")
+ label_path = os.path.join(args.datasets, "annotations", "instances_val2017.json")
+
+
+ data = {
+ 'task': 'val',
+ 'val': data_path,
+ 'anno_path': label_path,
+ 'names': coco_classes,
+ 'is_coco': True,
+ 'nc': 80
+ }
+
+ evaluator = EvalerIXRT(data, batch_size)
+
+ if args.perf_only:
+ fps = evaluator.eval_ixrt(args)
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {args.fps_target}")
+ else:
+ dataloader, pred_results = evaluator.eval_ixrt(args)
+ eval_result = evaluator.eval_ixrt_map(pred_results, dataloader, task)
+ map, map50 = eval_result[:2]
+ print("MAP@0.5 : ", map50)
+ print(f"Accuracy Check : Test {map50} >= target {args.acc_target}")
+ if map50 >= args.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/models/cv/detection/yolov6/ixrt/quant.py b/models/cv/detection/yolov6/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
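+ # Randomly sample step * batch_size images from data_dir to build the calibration set.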
+ num = step * batch_size
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..09cc0ac03802a697696ff3e68ea2c2157e240ea7
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_accuracy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..409fd354e86d7fa3092fda68bd1da2c1ed35498d
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_fp16_performance.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# fuse silu
+# FINAL_MODEL=${CHECKPOINTS_DIR}/yolov6_bs${BATCH_SIZE}_fused.onnx
+# if [ -f $FINAL_MODEL ];then
+# echo " "Fuse silu Skip, $FINAL_MODEL has been existed
+# else
+# python3 ${RUN_DIR}/deploy.py \
+# --src ${CURRENT_MODEL} \
+# --dst ${FINAL_MODEL}
+# echo " "Generate ${FINAL_MODEL}
+# fi
+# CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..701f80f06ac1ca46d154c1122f02913b247a83af
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_accuracy.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov6s_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV6s" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 640 \
+ --disable_quant_names '/detect/Split' '/detect/Div' '/detect/Sub' '/detect/Add' '/detect/Add_1' '/detect/Sub_1' '/detect/Div' '/detect/Concat_6' '/detect/Mul' '/detect/Concat_7' \
+ --use_letterbox
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..58f77417058c5461fe84161bb139bcecad4623c6
--- /dev/null
+++ b/models/cv/detection/yolov6/ixrt/scripts/infer_yolov6s_int8_performance.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov6s
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov6s.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov6s_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV6s" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --imgsz 640 \
+ --disable_quant_names '/detect/Split' '/detect/Div' '/detect/Sub' '/detect/Add' '/detect/Add_1' '/detect/Sub_1' '/detect/Div' '/detect/Concat_6' '/detect/Mul' '/detect/Concat_7' \
+ --use_letterbox
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov6s_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/detection/yolov8/ixrt/README.md b/models/cv/detection/yolov8/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..07558edf6f3591a70262c778309d67484d1edf4f
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/README.md
@@ -0,0 +1,72 @@
+# YOLOv8
+
+## Description
+
+YOLOv8 combines speed and accuracy in real-time object detection tasks. With a focus on simplicity and efficiency, this model employs a single neural network to make predictions, enabling fast and accurate identification of objects in images or video streams.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install onnxsim
+pip3 install pycocotools
+pip3 install ultralytics
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the validation dataset.
+
+```bash
+# get yolov8n.pt
+wget https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt
+# set coco path
+mkdir -p data/
+ln -s /Path/to/coco/ data/coco
+```
+
+### Model Conversion
+
+```bash
+python3 export.py --weight yolov8n.pt --batch 32
+onnxsim yolov8n.onnx ./data/yolov8n.onnx
+```
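+
+The exported ONNX can optionally be sanity-checked before building an engine. The snippet below is a minimal sketch; it only assumes the `onnx` package from the Install step and the `./data/yolov8n.onnx` file produced above:
+
+```python
+import onnx
+
+model = onnx.load("./data/yolov8n.onnx")
+onnx.checker.check_model(model)              # raises if the graph is structurally invalid
+print([t.name for t in model.graph.input])   # input tensor name(s)
+print([t.name for t in model.graph.output])  # output tensor name(s)
+```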
+
+## Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_yolov8n_fp16_accuracy.sh
+# Performance
+bash scripts/infer_yolov8n_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_yolov8n_int8_accuracy.sh
+# Performance
+bash scripts/infer_yolov8n_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | MAP@0.5 |
+| ------ | --------- | --------- | -------- | ------- |
+| YOLOv8 | 32 | FP16 | 1511.366 | 0.525 |
+| YOLOv8 | 32 | INT8 | 1841.017 | 0.517 |
diff --git a/models/cv/detection/yolov8/ixrt/build_engine.py b/models/cv/detection/yolov8/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5e1719a22c84b400a2ba9b9cbfdea6bae99e80d
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/build_engine.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+from tensorrt import Dims
+
+
+def build_engine_trtapi_staticshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build static shape engine done!")
+
+
+def build_engine_trtapi_dynamicshape(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
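+ # min / opt / max input shapes for the dynamic-batch optimization profile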
+ profile.set_shape("input",
+ Dims([1, 3, 608, 608]),
+ Dims([32, 3, 608, 608]),
+ Dims([64, 3, 608, 608]),
+ )
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(config.model)
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ # set dynamic
+ num_inputs = network.num_inputs
+ for i in range(num_inputs):
+ input_tensor = network.get_input(i)
+ input_tensor.shape = Dims([-1, 3, 608, 608])
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ print("Build dynamic shape engine done!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ # engine args
+ parser.add_argument("--engine", type=str, default=None)
+
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ build_engine_trtapi_staticshape(args)
+ # build_engine_trtapi_dynamicshape(args)
diff --git a/models/cv/detection/yolov8/ixrt/common.py b/models/cv/detection/yolov8/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3c2766533fa5a334a61231adb168ecf09622c3
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/common.py
@@ -0,0 +1,335 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import glob
+import time
+import numpy as np
+from tqdm import tqdm
+
+import tensorrt
+import pycuda.driver as cuda
+
+
+def load_class_names(namesfile):
+ class_names = []
+ with open(namesfile, 'r') as fp:
+ lines = fp.readlines()
+ for line in lines:
+ line = line.rstrip()
+ class_names.append(line)
+ return class_names
+
+# input : [bsz, box_num, 5(cx, cy, w, h, conf) + class_num(prob[0], prob[1], ...)]
+# output : [bsz, box_num, 6(left_top_x, left_top_y, right_bottom_x, right_bottom_y, class_id, max_prob*conf)]
+def box_class85to6(input):
+ center_x_y = input[:, :2]
+ side = input[:, 2:4]
+ conf = input[:, 4:5]
+ class_id = np.argmax(input[:, 5:], axis = -1)
+ class_id = class_id.astype(np.float32).reshape(-1, 1) + 1
+ max_prob = np.max(input[:, 5:], axis = -1).reshape(-1, 1)
+ x1_y1 = center_x_y - 0.5 * side
+ x2_y2 = center_x_y + 0.5 * side
+ nms_input = np.concatenate([x1_y1, x2_y2, class_id, max_prob*conf], axis = -1)
+ return nms_input
+
+def save2json(batch_img_id, pred_boxes, json_result, class_trans):
+ for i, boxes in enumerate(pred_boxes):
+ if boxes is not None:
+ image_id = int(batch_img_id[i])
+ # have no target
+ if image_id == -1:
+ continue
+
+ for x1, y1, x2, y2, _, p, c in boxes:
+ x1, y1, x2, y2, p = float(x1), float(y1), float(x2), float(y2), float(p)
+ c = int(c)
+ x = x1
+ y = y1
+ w = x2 - x1
+ h = y2 - y1
+
+ json_result.append(
+ {
+ "image_id": image_id,
+ "category_id": class_trans[c - 1],
+ "bbox": [x, y, w, h],
+ "score": p,
+ }
+ )
+
+################## About TensorRT #################
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
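+ # Describe each engine binding and allocate a device buffer sized from the context's current binding shape.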
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ # print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+##########################################################
+
+
+################## About Loading Dataset #################
+def load_images(images_path):
+ """
+ If an image path is given, return it directly.
+ For a txt file, read it and return each line as an image path.
+ Otherwise treat the argument as a folder and return the paths of all
+ jpg, jpeg and png files it contains.
+ """
+ input_path_extension = images_path.split('.')[-1]
+ if input_path_extension in ['jpg', 'jpeg', 'png']:
+ return [images_path]
+ elif input_path_extension == "txt":
+ with open(images_path, "r") as f:
+ return f.read().splitlines()
+ else:
+ return glob.glob(
+ os.path.join(images_path, "*.jpg")) + \
+ glob.glob(os.path.join(images_path, "*.png")) + \
+ glob.glob(os.path.join(images_path, "*.jpeg"))
+
+def prepare_batch(images_path, bs=16, input_size=(608, 608)):
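+ # Note: batches are flushed when i % bs == 0, so images collected after the last flush are not returned.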
+
+ width, height = input_size
+
+ batch_names = []
+ batch_images = []
+ batch_shapes = []
+
+ temp_names = []
+ temp_images = []
+ temp_shapes = []
+
+ for i, image_path in tqdm(enumerate(images_path), desc="Loading coco data"):
+ name = os.path.basename(image_path)
+ image = cv2.imread(image_path)
+ h, w, _ = image.shape
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image_resized = cv2.resize(image_rgb, (width, height),
+ interpolation=cv2.INTER_LINEAR)
+ custom_image = image_resized.transpose(2, 0, 1).astype(np.float32) / 255.
+ custom_image = np.expand_dims(custom_image, axis=0)
+
+ if i != 0 and i % bs == 0:
+ batch_names.append(temp_names)
+ batch_images.append(np.concatenate(temp_images, axis=0))
+ batch_shapes.append(temp_shapes)
+
+ temp_names = [name]
+ temp_images = [custom_image]
+ temp_shapes = [(h, w)]
+ else:
+ temp_names.append(name)
+ temp_images.append(custom_image)
+ temp_shapes.append((h, w))
+
+ return batch_names, batch_images, batch_shapes
+##########################################################
+
+
+################## About Operating box #################
+def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
+ # Resize and pad image while meeting stride-multiple constraints
+ shape = im.shape[:2] # current shape [height, width]
+ if isinstance(new_shape, int):
+ new_shape = (new_shape, new_shape)
+
+ # Scale ratio (new / old)
+ r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
+ if not scaleup: # only scale down, do not scale up (for better val mAP)
+ r = min(r, 1.0)
+
+ # Compute padding
+ ratio = r, r # width, height ratios
+ new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
+ dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
+ if auto: # minimum rectangle
+ dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding
+ elif scaleFill: # stretch
+ dw, dh = 0.0, 0.0
+ new_unpad = (new_shape[1], new_shape[0])
+ ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios
+
+ dw /= 2 # divide padding into 2 sides
+ dh /= 2
+
+ if shape[::-1] != new_unpad: # resize
+ im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
+ top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
+ left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
+ im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
+ return im, ratio, (dw, dh)
+
+def scale_boxes(net_shape, boxes, ori_shape, use_letterbox=False):
+ # Rescale boxes (xyxy) from net_shape to ori_shape
+
+ if use_letterbox:
+
+ gain = min(
+ net_shape[0] / ori_shape[0], net_shape[1] / ori_shape[1]
+ ) # gain = new / old
+ pad = (net_shape[1] - ori_shape[1] * gain) / 2, (
+ net_shape[0] - ori_shape[0] * gain
+ ) / 2.0
+
+ boxes[:, [0, 2]] -= pad[0] # x padding
+ boxes[:, [1, 3]] -= pad[1] # y padding
+ boxes[:, :4] /= gain
+ else:
+ x_scale, y_scale = net_shape[1] / ori_shape[1], net_shape[0] / ori_shape[0]
+
+ boxes[:, 0] /= x_scale
+ boxes[:, 1] /= y_scale
+ boxes[:, 2] /= x_scale
+ boxes[:, 3] /= y_scale
+
+ clip_boxes(boxes, ori_shape)
+ return boxes
+
+def clip_boxes(boxes, shape):
+
+ boxes[:, [0, 2]] = boxes[:, [0, 2]].clip(0, shape[1]) # x1, x2
+ boxes[:, [1, 3]] = boxes[:, [1, 3]].clip(0, shape[0]) # y1, y2
+##########################################################
+
+
+################## About pre and post processing #########
+def pre_processing(src_img, imgsz=608):
+ resized = cv2.resize(src_img, (imgsz, imgsz), interpolation=cv2.INTER_LINEAR)
+ in_img = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
+ in_img = np.transpose(in_img, (2, 0, 1)).astype(np.float32)
+ in_img = np.expand_dims(in_img, axis=0)
+ in_img /= 255.0
+ return in_img
+
+def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False):
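+ # Greedy IoU-based NMS: keep the highest-confidence box and drop boxes whose overlap exceeds nms_thresh.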
+ # print(boxes.shape)
+ x1 = boxes[:, 0]
+ y1 = boxes[:, 1]
+ x2 = boxes[:, 2]
+ y2 = boxes[:, 3]
+
+ areas = (x2 - x1) * (y2 - y1)
+ order = confs.argsort()[::-1]
+
+ keep = []
+ while order.size > 0:
+ idx_self = order[0]
+ idx_other = order[1:]
+
+ keep.append(idx_self)
+
+ xx1 = np.maximum(x1[idx_self], x1[idx_other])
+ yy1 = np.maximum(y1[idx_self], y1[idx_other])
+ xx2 = np.minimum(x2[idx_self], x2[idx_other])
+ yy2 = np.minimum(y2[idx_self], y2[idx_other])
+
+ w = np.maximum(0.0, xx2 - xx1)
+ h = np.maximum(0.0, yy2 - yy1)
+ inter = w * h
+
+ if min_mode:
+ over = inter / np.minimum(areas[order[0]], areas[order[1:]])
+ else:
+ over = inter / (areas[order[0]] + areas[order[1:]] - inter)
+
+ inds = np.where(over <= nms_thresh)[0]
+ order = order[inds + 1]
+
+ return np.array(keep)
+
+
+def post_processing(img, conf_thresh, nms_thresh, output, num_classes=80):
+
+ # [batch, num, 4] box coordinates
+ box_array = output[:, :, :4]
+ # [batch, num, 2] -> (class_id, max_conf)
+ class_confs = output[:, :, 4:]
+
+ max_conf = class_confs[:, :, 1]
+ max_id = class_confs[:, :, 0]
+
+ bboxes_batch = []
+ for i in range(box_array.shape[0]):
+
+ argwhere = max_conf[i] > conf_thresh
+ l_box_array = box_array[i, argwhere, :]
+ l_max_conf = max_conf[i, argwhere]
+ l_max_id = max_id[i, argwhere]
+
+ bboxes = []
+ # nms for each class
+ for j in range(num_classes):
+
+ cls_argwhere = l_max_id == j
+ ll_box_array = l_box_array[cls_argwhere, :]
+ ll_max_conf = l_max_conf[cls_argwhere]
+ ll_max_id = l_max_id[cls_argwhere]
+
+ keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
+
+ if (keep.size > 0):
+ ll_box_array = ll_box_array[keep, :]
+ ll_max_conf = ll_max_conf[keep]
+ ll_max_id = ll_max_id[keep]
+
+ for k in range(ll_box_array.shape[0]):
+ bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
+ ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
+
+ bboxes_batch.append(bboxes)
+
+ return bboxes_batch
+##########################################################
+
diff --git a/models/cv/detection/yolov8/ixrt/export.py b/models/cv/detection/yolov8/ixrt/export.py
new file mode 100644
index 0000000000000000000000000000000000000000..383b327e5794fd7930a78e2acfbf4237c556c4d8
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/export.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+from ultralytics import YOLO
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--weight",
+ type=str,
+ required=True,
+ help="pytorch model weight.")
+
+ parser.add_argument("--batch",
+ type=int,
+ required=True,
+ help="batchsize of the model.")
+ args = parser.parse_args()
+
+ return args
+
+def main():
+ args = parse_args()
+
+ model = YOLO(args.weight).cpu()
+
+ model.export(format='onnx', batch=args.batch, imgsz=(640, 640), opset=11)
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/detection/yolov8/ixrt/inference.py b/models/cv/detection/yolov8/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..d83b013610c132a776a2dc02663177e20a7ea2e3
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/inference.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import json
+import argparse
+import time
+import tensorrt
+from tensorrt import Dims
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import numpy as np
+from tqdm import tqdm
+
+from common import create_engine_context, setup_io_bindings
+
+from pathlib import Path
+
+from ultralytics.cfg import get_cfg
+from ultralytics.data import converter
+from ultralytics.utils import DEFAULT_CFG
+from ultralytics.data.utils import check_det_dataset
+from ultralytics.utils.metrics import ConfusionMatrix
+from ultralytics.models.yolo.detect import DetectionValidator
+
+coco_classes = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
+ 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
+ 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
+ 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
+ 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange',
+ 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed',
+ 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven',
+ 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'}
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--model_engine",
+ type=str,
+ required=True,
+ help="ixrt engine path.")
+
+ parser.add_argument("--bsz",
+ type=int,
+ required=True,
+ help="inference batch size.")
+
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=640,
+ help="inference size h,w",
+ )
+
+ parser.add_argument("--datasets",
+ type=str,
+ required=True,
+ help="datasets path.")
+
+ parser.add_argument("--warm_up",
+ type=int,
+ default=3,
+ help="number of warmup before test.")
+
+ parser.add_argument("--num_workers",
+ type=int,
+ default=16,
+ help="number of workers used in pytorch dataloader.")
+
+ parser.add_argument("--acc_target",
+ type=float,
+ default=0.0,
+ help="Model inference Accuracy target.")
+
+ parser.add_argument("--fps_target",
+ type=float,
+ default=0.0,
+ help="Model inference FPS target.")
+
+ parser.add_argument("--conf",
+ type=float,
+ default=0.001,
+ help="confidence threshold.")
+
+ parser.add_argument("--iou",
+ type=float,
+ default=0.65,
+ help="iou threshold.")
+
+ parser.add_argument("--perf_only",
+ type=bool,
+ default=False,
+ help="Run performance test only")
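+ # Note: with type=bool, argparse treats any non-empty string (including "false") as True; the scripts pass "--perf_only true".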
+
+ args = parser.parse_args()
+
+ return args
+
+class IxRT_Validator(DetectionValidator):
+ def __call__(self, config, data):
+ self.data = data
+ self.stride = 32
+ self.dataloader = self.get_dataloader(self.data.get(self.args.split), self.args.batch)
+ self.init_metrics()
+
+ total_num = 0
+
+ input_name = "input"
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+ engine, context = create_engine_context(config.model_engine, logger)
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims((config.bsz,3,config.imgsz,config.imgsz)))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ if config.warm_up > 0:
+ print("\nWarm-up start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm-up done.")
+
+ forward_time = 0.0
+ num_samples = 0
+
+ for batch in tqdm(self.dataloader):
+ batch = self.preprocess(batch)
+
+ imgs = batch['img']
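+ # Pad the last partial batch up to the configured batch size; padded outputs are sliced off after inference.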
+ pad_batch = len(imgs) != self.args.batch
+ if pad_batch:
+ origin_size = len(imgs)
+ imgs = np.resize(imgs, (self.args.batch, *imgs.shape[1:]))
+
+ batch_data = np.ascontiguousarray(imgs)
+ data_shape = batch_data.shape
+
+ cur_bsz_sample = batch_data.shape[0]
+ num_samples += cur_bsz_sample
+
+ # Set input
+ input_idx = engine.get_binding_index(input_name)
+ context.set_binding_shape(input_idx, Dims(data_shape))
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], batch_data)
+ # Prepare the output data
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+
+ start_time = time.time()
+ context.execute_v2(allocations)
+ end_time = time.time()
+ forward_time += end_time - start_time
+
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+ if pad_batch:
+ output = output[:origin_size]
+
+ outputs = torch.from_numpy(output)
+
+ preds = self.postprocess([outputs])
+
+ self.update_metrics(preds, batch)
+
+ if config.perf_only:
+ fps = num_samples / forward_time
+ return fps
+ else:
+ stats = self.get_stats()
+
+ if self.args.save_json and self.jdict:
+ with open(str(self.save_dir / 'predictions.json'), 'w') as f:
+ print(f'Saving {f.name} ...')
+ json.dump(self.jdict, f) # flatten and save
+
+ stats = self.eval_json(stats)
+
+ return stats
+
+ def init_metrics(self):
+ """Initialize evaluation metrics for YOLO."""
+ val = self.data.get(self.args.split, '') # validation path
+ self.is_coco = isinstance(val, str) and 'coco' in val and val.endswith(f'{os.sep}val2017.txt') # is COCO
+ self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(1000))
+ self.args.save_json |= self.is_coco and not self.training # run on final val if training COCO
+ self.names = self.data['names']
+ self.nc = len(self.names)
+ self.metrics.names = self.names
+ self.confusion_matrix = ConfusionMatrix(nc=80)
+ self.seen = 0
+ self.jdict = []
+ self.stats = dict(tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[])
+
+def main():
+ config = parse_args()
+
+ batch_size = config.bsz
+
+ overrides = {'mode': 'val'}
+ cfg_args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides)
+
+ cfg_args.batch = batch_size
+ cfg_args.save_json = True
+
+ data = {
+ 'path': Path(config.datasets),
+ 'val': os.path.join(config.datasets, 'val2017.txt'),
+ 'names': coco_classes
+ }
+
+ validator = IxRT_Validator(args=cfg_args, save_dir=Path('.'))
+
+ if config.perf_only:
+ fps = validator(config, data)
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ else:
+ stats = validator(config, data)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/models/cv/detection/yolov8/ixrt/quant.py b/models/cv/detection/yolov8/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..70265cbc25d24d4ed41640c76f78a1839555f749
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/quant.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import cv2
+import random
+import argparse
+import numpy as np
+from tensorrt.deploy import static_quantize
+
+import torch
+import torchvision.datasets
+from torch.utils.data import DataLoader
+from common import letterbox
+
+
+def setseed(seed=42):
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_name", type=str)
+ parser.add_argument("--model", type=str, default="yolov4_bs16_without_decoder.onnx")
+ parser.add_argument("--dataset_dir", type=str, default="./coco2017/val2017")
+ parser.add_argument("--ann_file", type=str, default="./coco2017/annotations/instances_val2017.json")
+ parser.add_argument("--observer", type=str, choices=["hist_percentile", "percentile", "minmax", "entropy", "ema"], default="hist_percentile")
+ parser.add_argument("--disable_quant_names", nargs='*', type=str)
+ parser.add_argument("--save_quant_model", type=str, help="save the quantization model path", default=None)
+ parser.add_argument("--bsz", type=int, default=16)
+ parser.add_argument("--step", type=int, default=32)
+ parser.add_argument("--seed", type=int, default=42)
+ parser.add_argument("--imgsz", type=int, default=608)
+ parser.add_argument("--use_letterbox", action="store_true")
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+setseed(args.seed)
+model_name = args.model_name
+
+
+def get_dataloader(data_dir, step=32, batch_size=16, new_shape=[608, 608], use_letterbox=False):
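+ # Randomly sample step * batch_size images from data_dir to build the calibration set.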
+ num = step * batch_size
+ val_list = [os.path.join(data_dir, x) for x in os.listdir(data_dir)]
+ random.shuffle(val_list)
+ pic_list = val_list[:num]
+
+ calibration_dataset = []
+ for file_path in pic_list:
+ pic_data = cv2.imread(file_path)
+ org_img = pic_data
+ assert org_img is not None, 'Image not Found ' + file_path
+ h0, w0 = org_img.shape[:2]
+
+ if use_letterbox:
+ img, ratio, dwdh = letterbox(org_img, new_shape=(new_shape[1], new_shape[0]), auto=False, scaleup=True)
+ else:
+ img = cv2.resize(org_img, new_shape)
+ img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
+ img = np.ascontiguousarray(img) / 255.0 # 0~1 np array
+ img = torch.from_numpy(img).float()
+
+ calibration_dataset.append(img)
+
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ shuffle=True,
+ batch_size=batch_size,
+ drop_last=True
+ )
+ return calibration_dataloader
+
+dataloader = get_dataloader(
+ data_dir=args.dataset_dir,
+ step=args.step,
+ batch_size=args.bsz,
+ new_shape=(args.imgsz, args.imgsz),
+ use_letterbox=args.use_letterbox
+)
+
+dirname = os.path.dirname(args.save_quant_model)
+quant_json_path = os.path.join(dirname, f"quantized_{model_name}.json")
+
+static_quantize(args.model,
+ calibration_dataloader=dataloader,
+ save_quant_onnx_path=args.save_quant_model,
+ save_quant_params_path=quant_json_path,
+ observer=args.observer,
+ data_preprocess=lambda x: x.to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=args.disable_quant_names)
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..44e7537657a65fc84d89531b8df9ad647513dfbe
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_accuracy.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ab3808f1f45cf2072fa41a2107fa88c17fa3610
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_fp16_performance.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_fp16.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision float16 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+ --fps_target 0.0
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a2257463d70ee8fe6e9853db0fafd44f98ad8c83
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_accuracy.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+DISABLE_NAMES=('/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5')
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov8n_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+ echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV8N" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --disable_quant_names '/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5' \
+ --imgsz 640
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_int8.engine
+if [ -f $ENGINE_FILE ];then
+ echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --acc_target 0.3
+exit ${EXIT_STATUS}
diff --git a/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1774d5b2b28ce734dadb3e022a3359b3790f2da
--- /dev/null
+++ b/models/cv/detection/yolov8/ixrt/scripts/infer_yolov8n_int8_performance.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+PROJ_DIR=$(cd $(dirname $0);cd ../; pwd)
+DATASETS_DIR="${PROJ_DIR}/data/coco"
+COCO_GT=${DATASETS_DIR}/annotations/instances_val2017.json
+EVAL_DIR=${DATASETS_DIR}/images/val2017
+CHECKPOINTS_DIR="${PROJ_DIR}/data"
+RUN_DIR="${PROJ_DIR}"
+ORIGINE_MODEL=${CHECKPOINTS_DIR}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo ====================== Model Info ======================
+echo Model Name : yolov8n
+echo Onnx Path : ${ORIGINE_MODEL}
+
+BATCH_SIZE=32
+CURRENT_MODEL=${CHECKPOINTS_DIR}/yolov8n.onnx
+
+# quant
+FINAL_MODEL=${CHECKPOINTS_DIR}/quantized_yolov8n_bs${BATCH_SIZE}.onnx
+if [ -f $FINAL_MODEL ];then
+    echo " "Quantize Skip, $FINAL_MODEL already exists
+else
+ python3 ${RUN_DIR}/quant.py \
+ --model_name "YOLOV8N" \
+ --model ${CURRENT_MODEL} \
+ --bsz ${BATCH_SIZE} \
+ --dataset_dir ${EVAL_DIR} \
+ --ann_file ${COCO_GT} \
+ --observer "hist_percentile" \
+ --save_quant_model ${FINAL_MODEL} \
+ --disable_quant_names '/model.22/Concat' '/model.22/Concat_1' '/model.22/Concat_2' '/model.22/Reshape' '/model.22/Reshape_1' '/model.22/Reshape_2' '/model.22/Concat_3' '/model.22/Split' '/model.22/dfl/Reshape' '/model.22/dfl/Transpose' '/model.22/dfl/Softmax' '/model.22/dfl/Transpose_1' '/model.22/dfl/conv/Conv' '/model.22/dfl/Reshape_1' '/model.22/Slice' '/model.22/Slice_1' '/model.22/Sub' '/model.22/Add_1' '/model.22/Add_2' '/model.22/Div_1' '/model.22/Sub_1' '/model.22/Concat_4' '/model.22/Mul_2' '/model.22/Sigmoid' '/model.22/Concat_5' \
+ --imgsz 640
+ echo " "Generate ${FINAL_MODEL}
+fi
+CURRENT_MODEL=${FINAL_MODEL}
+
+# Build Engine
+echo Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/yolov8n_int8.engine
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision int8 \
+ --model ${CURRENT_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+echo Inference
+RUN_BATCH_SIZE=32
+python3 ${RUN_DIR}/inference.py \
+ --model_engine ${ENGINE_FILE} \
+ --warm_up 2 \
+ --bsz ${RUN_BATCH_SIZE} \
+ --imgsz 640 \
+ --datasets ${DATASETS_DIR} \
+ --perf_only true \
+    --fps_target 0.0; check_status
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/README.md b/models/cv/face/facenet/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c2df5120bf75917c11d1d5a68c7dd377c5c823a
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/README.md
@@ -0,0 +1,101 @@
+# FaceNet
+
+## Description
+
+FaceNet is a facial recognition system originally proposed and developed by Google. It uses deep learning, specifically convolutional neural networks (CNNs), to transform facial images into high-dimensional feature vectors (embeddings). These embeddings are highly discriminative, enabling different faces to be compared and identified. The core idea of FaceNet is to map faces into a multi-dimensional embedding space, achieving an efficient representation and recognition of faces.
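+
+For intuition, the sketch below (illustrative only, not part of this repository) shows how two 512-dimensional embeddings produced by the model could be compared; the 1.1 distance threshold is an assumption for illustration, not a tuned value.
+
+```python
+import numpy as np
+
+def embedding_distance(emb_a: np.ndarray, emb_b: np.ndarray) -> float:
+    """Euclidean distance between two L2-normalized face embeddings."""
+    emb_a = emb_a / np.linalg.norm(emb_a)
+    emb_b = emb_b / np.linalg.norm(emb_b)
+    return float(np.linalg.norm(emb_a - emb_b))
+
+# Illustration with random vectors; real embeddings come from the model's 512-d output.
+a, b = np.random.rand(512), np.random.rand(512)
+print("same identity" if embedding_distance(a, b) < 1.1 else "different identities")
+```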
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tensorflow
+pip3 install onnxsim
+pip3 install scikit-learn
+pip3 install tf_slim
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install scipy==1.8.0
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install simplejson
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: download the LFW dataset.
+
+```bash
+cd ${DeepSparkInference_PATH}/models/cv/face/facenet/ixrt
+# download and unzip 20180408-102900.zip
+unzip 20180408-102900.zip
+```
+
+### Model Conversion
+
+```bash
+
+mkdir -p checkpoints
+mkdir -p facenet_weights
+git clone https://github.com/timesler/facenet-pytorch
+mv /Path/facenet/ixrt/tensorflow2pytorch.py facenet-pytorch
+python3 ./facenet-pytorch/tensorflow2pytorch.py \
+ --facenet_weights_path ./facenet_weights \
+ --facenet_pb_path ./20180408-102900 \
+ --onnx_save_name facenet_export.onnx
+mv facenet_export.onnx ./facenet_weights
+```
+
+### Data Preprocessing
+
+The images in the original dataset need to be resized to 160x160. For details, please refer to the following link: . That preprocessing code relies on TensorFlow 1.x; if you run into TensorFlow version incompatibilities while processing the dataset, you can instead download the preprocessed dataset from here:
+
+```bash
+# download and unzip facenet_datasets.zip
+wget https://raw.githubusercontent.com/lanrax/Project_dataset/master/facenet_datasets.zip
+unzip facenet_datasets.zip
+```
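+
+If you prefer to preprocess the images yourself, the sketch below (illustrative only; it assumes Pillow is installed and uses a hypothetical `lfw_raw` source directory) covers only the final 160x160 resize step and does not replace the face detection/alignment performed by the original preprocessing pipeline referenced above.
+
+```python
+from pathlib import Path
+from PIL import Image
+
+src = Path("lfw_raw")               # hypothetical directory of original images
+dst = Path("facenet_datasets/lfw")  # layout expected by the inference scripts
+for img_path in src.rglob("*.jpg"):
+    out_path = dst / img_path.relative_to(src)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    Image.open(img_path).convert("RGB").resize((160, 160), Image.BILINEAR).save(out_path)
+```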
+
+## Inference
+
+Because exported models can differ, verify the following before running inference: in deploy.py, "/last_bn/BatchNormalization_output_0" must match the output name of the BatchNormalization node in your exported ONNX model (it may instead be a plain number such as "1187"), and "/avgpool_1a/GlobalAveragePool_output_0" must match the output name of the GlobalAveragePool node (such as "1178"). Also update "/last_bn/BatchNormalization_output_0" in build_engine.py to the corresponding name, such as "1187".
+
+```bash
+sed -i -e 's#/last_bn/BatchNormalization_output_0#1187#g' -e 's#/avgpool_1a/GlobalAveragePool_output_0#1178#g' deploy.py build_engine.py
+```
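+
+If the substitution above does not match your export, one way to look up the actual node output names (a minimal inspection sketch, assuming the `onnx` Python package is installed):
+
+```python
+import onnx
+
+model = onnx.load("facenet_weights/facenet_export.onnx")
+for node in model.graph.node:
+    if node.op_type in ("BatchNormalization", "GlobalAveragePool"):
+        print(node.op_type, node.name, "->", list(node.output))
+```
+
+Substitute the printed output names into the sed command above.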
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_facenet_fp16_accuracy.sh
+# Performance
+bash scripts/infer_facenet_fp16_performance.sh
+```
+
+### INT8
+
+```bash
+# Accuracy
+bash scripts/infer_facenet_int8_accuracy.sh
+# Performance
+bash scripts/infer_facenet_int8_performance.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | AUC | ACC |
+| ------- | --------- | --------- | --------- | ----- | ---------------- |
+| FaceNet | 64 | FP16 | 8825.802 | 0.999 | 0.98667+-0.00641 |
+| FaceNet | 64 | INT8 | 14274.306 | 0.999 | 0.98633+-0.00605 |
diff --git a/models/cv/face/facenet/ixrt/build_engine.py b/models/cv/face/facenet/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..74a62202defa50397cc4227da2181eebe10ab3e9
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/build_engine.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import cv2
+import argparse
+import numpy as np
+
+import torch
+import tensorrt
+
+import onnx
+from onnx import helper
+from onnx import TensorProto,numpy_helper
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def add_facenet_norm(onnx_model):
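+    # Append the custom FacenetNorm_IxRT plugin node (registered via load_ixrt_plugin) after the
+    # 512-d BatchNormalization output, and rebuild the graph so that node's output is the sole
+    # graph output; the patched model is saved to the temporary file tmp4.onnx.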
+ norm = helper.make_node('FacenetNorm_IxRT', inputs=['/last_bn/BatchNormalization_output_0'] , outputs=['/Pow_1_output_0'], name='facenet_norm_1', size=512)
+
+ onnx_model = onnx.load(onnx_model)
+ graph = onnx_model.graph
+ nodes = graph.node
+ graph.node.append(norm)
+ output = onnx.helper.make_tensor_value_info('/Pow_1_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+ graph = onnx.helper.make_graph(
+ graph.node,
+ "facenet model",
+ graph.input,
+ [output],
+ graph.initializer
+ )
+ info_model = onnx.helper.make_model(graph, producer_name="facenet")
+ info_model.opset_import[0].version = 11
+ onnx.save(info_model, "tmp4.onnx")
+
+def main(config):
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ print("start prepare...")
+ add_facenet_norm(config.model)
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file("tmp4.onnx")
+
+ precision = tensorrt.BuilderFlag.INT8 if config.precision == "int8" else tensorrt.BuilderFlag.FP16
+ # print("precision : ", precision)
+ build_config.set_flag(precision)
+
+ plan = builder.build_serialized_network(network, build_config)
+ engine_file_path = config.engine
+ with open(engine_file_path, "wb") as f:
+ f.write(plan)
+ os.remove("tmp4.onnx")
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--precision", type=str, choices=["float16", "int8", "float32"], default="int8",
+ help="The precision of datatype")
+ parser.add_argument("--engine", type=str, default=None)
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/common.py b/models/cv/face/facenet/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db1327ad1531c452fb38182d747c81fc6f8eccf
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/common.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import cv2
+import glob
+import torch
+import tensorrt
+import numpy as np
+import pycuda.driver as cuda
+
+from torch.utils.data import DataLoader, SubsetRandomSampler, SequentialSampler
+from torchvision import datasets, transforms
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+
+def fixed_image_standardization(image_tensor):
+ processed_tensor = (image_tensor - 127.5) / 128.0
+ return processed_tensor
+
+def collate_pil(x):
+ out_x, out_y = [], []
+ for xx, yy in x:
+ out_x.append(xx)
+ out_y.append(yy)
+ return out_x, out_y
+
+def getdataloader(datasets_dir, step=20, batch_size=64, image_size=160):
+ orig_img_ds = datasets.ImageFolder(datasets_dir + 'lfw', transform=None)
+ orig_img_ds.samples = [
+ (p, p)
+ for p, _ in orig_img_ds.samples
+ ]
+ loader = DataLoader(
+ orig_img_ds,
+ num_workers=16,
+ batch_size=batch_size,
+ collate_fn=collate_pil
+ )
+ crop_paths = []
+ box_probs = []
+ for i, (x, b_paths) in enumerate(loader):
+ crops = [p for p in b_paths]
+ crop_paths.extend(crops)
+ # print('\rBatch {} of {}'.format(i + 1, len(loader)), end='')
+
+ trans = transforms.Compose([
+ np.float32,
+ transforms.ToTensor(),
+ fixed_image_standardization
+ ])
+
+ dataset = datasets.ImageFolder(datasets_dir + 'lfw', transform=trans)
+ embed_loader = DataLoader(
+ dataset,
+ num_workers=16,
+ batch_size=batch_size,
+ sampler=SequentialSampler(dataset)
+ )
+
+ return embed_loader, crop_paths
diff --git a/models/cv/face/facenet/ixrt/config/FACENET_CONFIG b/models/cv/face/facenet/ixrt/config/FACENET_CONFIG
new file mode 100644
index 0000000000000000000000000000000000000000..3b3282eff772fa4a2d46d2cc2aace1570ad0f1bb
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/config/FACENET_CONFIG
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# IMGSIZE : model input height/width
+# MODEL_NAME : basename used for the generated onnx/engine files
+# ORIGINE_MODEL : file name of the original onnx model
+IMGSIZE=160
+MODEL_NAME=facenet
+ORIGINE_MODEL=facenet_export.onnx
+
+# QUANT CONFIG (effective only when PRECISION is int8)
+    # QUANT_OBSERVER : quantization observer, one of [hist_percentile, percentile, minmax, entropy, ema]
+    # QUANT_BATCHSIZE : dataloader batch size used during quantization; keep it consistent with the batch size in the onnx, otherwise some ops (e.g. Reshape) may infer shapes incorrectly
+    # QUANT_STEP : number of quantization (calibration) steps
+    # QUANT_SEED : random seed, to make the quantization results reproducible
+    # QUANT_EXIST_ONNX : set this if a quantized model from another source should be used
+QUANT_OBSERVER=hist_percentile
+QUANT_BATCHSIZE=64
+QUANT_STEP=32
+QUANT_SEED=42
+DISABLE_QUANT_LIST=
+QUANT_EXIST_ONNX=
diff --git a/models/cv/face/facenet/ixrt/deploy.py b/models/cv/face/facenet/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f4ce5880bb50f78127a923e09c446547ac3fd2
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/deploy.py
@@ -0,0 +1,445 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import os
+import simplejson as json
+import argparse
+from onnxsim import simplify
+import numpy as np
+import shutil
+from onnx import numpy_helper
+from onnx import AttributeProto, TensorProto, GraphProto
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def onnx_sim(onnx_name, save_name):
+ # simplify onnx
+ cmd = "onnxsim {} {}".format(onnx_name, save_name)
+ os.system(cmd)
+ print("[info] onnxsim done!")
+
+
+def cut_model(onnx_name):
+ input_names = ["input"]
+ output_names = ["/last_bn/BatchNormalization_output_0"]
+ onnx.utils.extract_model(onnx_name, onnx_name, input_names, output_names)
+
+def fuse_matmul(onnx_name, save_onnx_name):
+ find_matmul = 0
+
+ onnx_model = onnx.load(onnx_name)
+
+ graph = onnx_model.graph
+ nodes = graph.node
+
+ conv_weights = None
+ conv_bias = None
+ bn_weights = None
+ bn_bias = None
+ conv_weights_new = None
+ conv_bias_new = None
+
+ pre_node = None
+ for i, node in enumerate(nodes):
+ if (node.op_type == "Conv"):
+ pass
+ if (node.op_type == "MatMul"):
+ for k, ten in enumerate(graph.initializer):
+ if ten.name == node.input[1]:
+ H , W = ten.dims
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ weights = weights.reshape(ten.dims)
+ conv_weights = weights.transpose()
+ if (node.op_type == "BatchNormalization" and pre_node.op_type == "MatMul"):
+ find_matmul=1
+ weights = None
+ bias = None
+ mean = None
+ var = None
+
+ for k, ten in enumerate(graph.initializer):
+ if ten.name == node.input[1]:
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[2]:
+ bias = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[3]:
+ mean = np.fromstring(ten.raw_data, dtype=np.float32)
+ if ten.name == node.input[4]:
+ var = np.fromstring(ten.raw_data, dtype=np.float32)
+
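+            # Fold the BatchNormalization into the preceding MatMul, which is rewritten below as
+            # a 1x1 Conv: y = gamma * (x W - mean) / sqrt(var + eps) + beta, so the fused weight
+            # is diag(gamma / sqrt(var + eps)) @ W^T and the fused bias is
+            # beta - gamma * mean / sqrt(var + eps). A fixed eps of 1e-8 is used here rather than
+            # the node's epsilon attribute.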
+ bn_weights = np.diag(weights / np.sqrt(var + 1e-8))
+ bn_bias = bias - weights * mean / np.sqrt(var + 1e-8)
+
+ conv_weights_new = np.matmul(bn_weights, conv_weights)
+ a, b = conv_weights_new.shape
+ conv_weights_new = conv_weights_new.reshape((a,b,1,1))
+ # conv_bias_new = bn_weights * conv_bias + bn_bias
+ conv_bias_new = 0 + bn_bias
+ conv_weights_new_initializer = onnx.numpy_helper.from_array(conv_weights_new, name='conv_weights_new')
+ graph.initializer.append(conv_weights_new_initializer)
+ conv_bias_new_initializer = onnx.numpy_helper.from_array(conv_bias_new, name='conv_bias_new')
+ graph.initializer.append(conv_bias_new_initializer)
+
+ pre_node.op_type = "Conv"
+ pre_node.input[0] = "/avgpool_1a/GlobalAveragePool_output_0"
+ pre_node.input[1] = "conv_weights_new"
+ pre_node.input.append("conv_bias_new")
+ pre_node.output[0] = "/last_bn/BatchNormalization_output_0"
+ dilations = onnx.helper.make_attribute("dilations", [1,1])
+ group = onnx.helper.make_attribute("group", 1)
+ kernel_shape = onnx.helper.make_attribute("kernel_shape", [1,1])
+ pads = onnx.helper.make_attribute("pads", [0,0,0,0])
+ strides = onnx.helper.make_attribute("strides", [1,1])
+
+ pre_node.attribute.append(dilations)
+ pre_node.attribute.append(group)
+ pre_node.attribute.append(kernel_shape)
+ pre_node.attribute.append(pads)
+ pre_node.attribute.append(strides)
+ graph.node.remove(node)
+
+ pre_node = node
+
+ for i, node in enumerate(nodes):
+ if (node.name == "Reshape_353"):
+ # print("[reshape] : ", node.name)
+ graph.node.remove(node)
+
+ if find_matmul==1:
+ output = onnx.helper.make_tensor_value_info('/last_bn/BatchNormalization_output_0', TensorProto.FLOAT, [64, 512, 1, 1])
+ graph = onnx.helper.make_graph(
+ graph.node,
+ "facenet model",
+ graph.input,
+ [output],
+ graph.initializer
+ )
+
+ info_model = onnx.helper.make_model(graph, producer_name="facenet")
+ info_model.opset_import[0].version = 11
+ onnx_model = onnx.shape_inference.infer_shapes(info_model)
+
+ onnx.checker.check_model(onnx_model)
+ onnx.save(onnx_model, save_onnx_name)
+
+def fuse_mul(onnx_name, save_onnx_name):
+ onnx_model = onnx.load(onnx_name)
+
+ graph = onnx_model.graph
+ nodes = graph.node
+ pre_node = None
+
+ for i, node in enumerate(nodes):
+ if (node.op_type == "Constant"):
+ pass
+
+ if (node.op_type == "Mul" and pre_node.op_type == "Conv" ):
+ for ten in graph.initializer:
+ if ten.name == node.input[1]:
+ scale_name = ten.name
+ scale = np.fromstring(ten.raw_data, dtype=np.float32)
+
+ for k, ten in enumerate(graph.initializer):
+ # print(ten.name)
+ if ten.name == pre_node.input[1]:
+ weights_name = ten.name
+ weights = np.fromstring(ten.raw_data, dtype=np.float32)
+ weights *= scale
+ graph.initializer[k].raw_data = weights.tobytes()
+
+ if ten.name == pre_node.input[2]:
+ bias_name = ten.name
+ bias = np.fromstring(ten.raw_data, dtype=np.float32)
+ # print("bias len: ",len(da))
+ bias *= scale
+ graph.initializer[k].raw_data = bias.tobytes()
+
+ new_conv = pre_node
+ new_conv.output[0] = node.output[0]
+ graph.node.remove(node)
+ pre_node = node
+
+ onnx.checker.check_model(onnx_model)
+ onnx.save(onnx_model, save_onnx_name)
+
+def create_graph_json(onnx_name):
+ # create graph json and weights
+ graph_path = onnx_name[0:-5] + "_graph.json"
+ weight_path = onnx_name[0:-5] + ".weights"
+
+ model = onnx.load(onnx_name)
+ graph = model.graph
+ nodes = graph.node
+ initializer = graph.initializer
+ value_info = graph.value_info # Infer shape info
+
+ model_inputs = [tensor.name for tensor in graph.input]
+ model_outputs = [tensor.name for tensor in graph.output]
+
+ model = {}
+ model["nodes"] = {}
+ model["tensors"] = {}
+ model["edges"] = {}
+ model["output"] = {}
+ data_type_table = {
+ 1: "float32",
+ 2: "uint8",
+ 3: "int8",
+ 4: "uint16",
+ 5: "int16",
+ 6: "int32",
+ 7: "int64",
+ 9: "bool",
+ 10: "float16",
+ 11: "double",
+ 12: "uint32",
+ 13: "uint64",
+ }
+ input_cache = []
+ for item in graph.input:
+ if item.type.tensor_type.elem_type in data_type_table:
+ cache = {
+ "name": item.name,
+ "type": data_type_table[item.type.tensor_type.elem_type],
+ }
+ else:
+ cache = {"name": item.name}
+ input_cache.append(cache)
+ model["input"] = input_cache
+
+ output_cache = []
+ for item in graph.output:
+ if item.type.tensor_type.elem_type in data_type_table:
+ cache = {
+ "name": item.name,
+ "type": data_type_table[item.type.tensor_type.elem_type],
+ }
+ else:
+ cache = {"name": item.name}
+ output_cache.append(cache)
+ model["output"] = output_cache
+
+ # find cast dict
+ input_cast_dict = {}
+ output_cast_dict = {}
+ for i, item in enumerate(nodes):
+ node_name = item.name
+ input_edge_list = list(item.input)
+ output_edge_list = list(item.output)
+ # remove input and output cast op
+ if item.op_type == "Cast":
+ if input_edge_list[0] in model_inputs:
+ input_cast_dict[output_edge_list[0]] = input_edge_list[0]
+ if output_edge_list[0] in model_outputs:
+ output_cast_dict[input_edge_list[0]] = output_edge_list[0]
+
+ for i, item in enumerate(nodes):
+ node_name = item.name
+ input_edge_list = list(item.input)
+ output_edge_list = list(item.output)
+ # remove input and output cast op
+ if item.op_type == "Cast":
+ if input_edge_list[0] in model_inputs:
+ continue
+ if output_edge_list[0] in model_outputs:
+ continue
+
+ for idx, edge_name in enumerate(input_edge_list):
+ if edge_name in input_cast_dict.keys():
+ input_edge_list[idx] = input_cast_dict[edge_name]
+
+ for idx, edge_name in enumerate(output_edge_list):
+ if edge_name in output_cast_dict.keys():
+ output_edge_list[idx] = output_cast_dict[edge_name]
+
+ # remove mask in EmbedLayerNormalization
+ if item.op_type == "EmbedLayerNormalization":
+ no_attention_mask_in_Embed = True
+ for input_edge in input_edge_list:
+ if "attention_mask" in input_edge:
+ input_edge_list.remove(input_edge)
+ no_attention_mask_in_Embed = False
+ if no_attention_mask_in_Embed:
+ for tensor_name in model_inputs:
+ if "attention_mask" in tensor_name:
+ output_edge_list[1] = tensor_name
+
+ node_dict = {"inputs": input_edge_list, "outputs": output_edge_list}
+ node_dict["op_type"] = item.op_type
+ attribute_dict = {}
+
+ if node_name == "":
+ for input_edge in input_edge_list:
+ node_name += input_edge + "_"
+ node_name += "to"
+ for output_edge in output_edge_list:
+ node_name += "_" + output_edge
+
+ for attr in item.attribute:
+
+ if attr.type == onnx.AttributeProto().AttributeType.FLOAT:
+ attribute_dict[attr.name] = attr.f
+ if attr.type == onnx.AttributeProto().AttributeType.FLOATS:
+ attribute_dict[attr.name] = [x for x in attr.floats]
+ if attr.type == onnx.AttributeProto().AttributeType.INT:
+ attribute_dict[attr.name] = attr.i
+ if attr.type == onnx.AttributeProto().AttributeType.INTS:
+ attribute_dict[attr.name] = [x for x in attr.ints]
+ if attr.type == onnx.AttributeProto().AttributeType.STRING:
+ attribute_dict[attr.name] = str(attr.s.decode("UTF-8"))
+ if attr.type == onnx.AttributeProto().AttributeType.STRINGS:
+ attribute_dict[attr.name] = [str(x.decode("UTF-8")) for x in attr.strings]
+
+ node_dict["attrbiute"] = attribute_dict
+ model["nodes"][node_name] = node_dict
+
+ for i, item in enumerate(initializer):
+ tensor_name = item.name
+ tensor_dict = {}
+ if item.data_type in data_type_table:
+ tensor_dict["data_type"] = data_type_table[item.data_type]
+ else:
+ print(
+ tensor_name,
+ " use unsupport data type: ",
+ item.data_type,
+ ", data info will not be saved",
+ )
+ continue
+ tensor_dict["dims"] = list(item.dims)
+
+ model["tensors"][tensor_name] = tensor_dict
+
+ with open(graph_path, "w") as fh:
+ json.dump(model, fh, indent=4)
+
+
+ """
+ Export weight
+ """
+ byte_string = "".encode()
+
+ weight_file_postfix = ".weights"
+ for item in initializer:
+ tensor_name = item.name
+
+ np_data = None
+ if len(item.raw_data):
+ np_data = np.frombuffer(item.raw_data, dtype=np.byte)
+ elif item.data_type == 1 and len(item.float_data):
+ np_data = np.array(list(item.float_data), dtype=np.float32)
+ elif item.data_type == 2 and len(item.int32_data):
+ np_data = np.array(list(item.int32_data), dtype=np.uint8)
+ elif item.data_type == 6 and len(item.int32_data):
+ np_data = np.array(list(item.int32_data), dtype=np.int32)
+ elif item.data_type == 7 and len(item.int64_data):
+ np_data = np.array(list(item.int64_data), dtype=np.int64)
+ elif item.data_type == 10 and len(item.int32_data):
+ np_data = (
+ np.asarray(item.int32_data, dtype=np.uint16)
+ .reshape(item.dims)
+ .view(np.float16)
+ )
+ else:
+ print(
+ "tensor name: ",
+ tensor_name,
+ ", type: ",
+ item.data_type,
+ ", len: ",
+ len(item.raw_data),
+ len(item.float_data),
+ len(item.int32_data),
+ len(item.int64_data),
+ ", will not save into weights file",
+ )
+
+ if np_data is not None:
+ byte_string += np.uint64(len(tensor_name)).tobytes()
+ byte_string += tensor_name.encode()
+ np_bytes = np_data.tobytes()
+ byte_string += np.uint64(len(np_bytes)).tobytes()
+ byte_string += np_bytes
+
+
+ # Export weight values as bin file
+ with open(weight_path, "wb") as fh:
+ fh.write(byte_string)
+ print("----------------------------")
+ print("[OK] graph and weights file save at :")
+ print(graph_path)
+ print(weight_path)
+ return graph_path, weight_path
+
+def add_facenet_norm(cfg_name):
+ graph_json = json.load(open(cfg_name))
+
+ graph_json["nodes"]["facenet_norm_1"] = {
+ "inputs": [
+ "/last_bn/BatchNormalization_output_0"
+ ],
+ "outputs": [
+ "/Pow_1_output_0"
+ ],
+ "op_type": "FacenetNorm",
+ "attrbiute": {
+ "size": 512
+ }
+ }
+ graph_json["output"] = []
+ graph_json["output"].append({"name":"/Pow_1_output_0", "type":"float32"})
+
+ with open(cfg_name, "w") as fh:
+ json.dump(graph_json, fh, indent=4)
+
+
+def main(args):
+ print("[info] input onnx name :", args.onnx_name)
+ # onnxsim
+ onnx_sim(args.onnx_name, "tmp1.onnx")
+ # cut model
+ cut_model("tmp1.onnx")
+ # fuse matmul bn
+ fuse_matmul("tmp1.onnx", "tmp2.onnx")
+ # fuse mul
+ fuse_mul("tmp2.onnx", "facenet_weights/facenet.onnx")
+ # generate cfg weights
+ # graph_path, weight_path = create_graph_json("facenet_weights/facenet.onnx")
+ # add facenet norm
+ # add_facenet_norm(graph_path)
+
+ os.remove("tmp1.onnx")
+ os.remove("tmp2.onnx")
+ print("\n[info] facenet deploy done!!!")
+
+
+def parse_args():
+ parser = argparse.ArgumentParser("deploy facenet")
+ parser.add_argument("--model_name", default="facenet", help="model name")
+ parser.add_argument("--onnx_name", default="facenet_weights/facenet_export.onnx", help="onnx filepath")
+ parser.add_argument("--save_name", default="facenet_weights/facenet.onnx", help="onnx filepath")
+ parser.add_argument("--data_type", default="int8", type=str, choices=["float16", "int8"], help="int8 float16")
+ parser.add_argument("--batch_size", default="64", type=int, help="batch_size")
+ parser.add_argument("--quant_file", default="", type=str, help="quant file")
+ parser.add_argument("--img_size", default="160", type=int, help="image size")
+ parser.add_argument("--device", default=0, type=int, help="cuda device 0 1 3 ...")
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ main(args)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/inference.py b/models/cv/face/facenet/ixrt/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec9876e33c800206003d4d5e2c2d165929ba6591
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/inference.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+import json
+import os
+import re
+import time
+from tqdm import tqdm
+
+import cv2
+import numpy as np
+import pycuda.autoinit
+import pycuda.driver as cuda
+import torch
+import tensorrt
+from tensorrt.utils import topk
+from sklearn import metrics
+from scipy.optimize import brentq
+from sklearn.model_selection import KFold
+from scipy import interpolate
+
+from utils import read_pairs, get_paths, evaluate
+from common import getdataloader, create_engine_context, get_io_bindings
+from load_ixrt_plugin import load_ixrt_plugin
+load_ixrt_plugin()
+
+def main(config):
+ embed_loader, crop_paths = getdataloader(config.datasets_dir, config.loop_count, config.bsz, config.imgsz)
+
+ host_mem = tensorrt.IHostMemory
+ logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+
+ # Load Engine && I/O bindings
+ engine, context = create_engine_context(config.engine_file, logger)
+ inputs, outputs, allocations = get_io_bindings(engine)
+
+ # Warm up
+ if config.warm_up > 0:
+ print("\nWarm Start.")
+ for i in range(config.warm_up):
+ context.execute_v2(allocations)
+ print("Warm Done.")
+
+ # Inference
+ if config.test_mode == "FPS":
+ torch.cuda.synchronize()
+ start_time = time.time()
+
+ for i in range(config.loop_count):
+ context.execute_v2(allocations)
+
+ torch.cuda.synchronize()
+ end_time = time.time()
+ forward_time = end_time - start_time
+
+ fps = config.loop_count * config.bsz / forward_time
+
+ print("FPS : ", fps)
+ print(f"Performance Check : Test {fps} >= target {config.fps_target}")
+ if fps >= config.fps_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+ elif config.test_mode == "ACC":
+
+ classes = []
+ embeddings = []
+
+ for xb, yb in tqdm(embed_loader):
+
+ output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+ current_imgs_num = xb.numpy().shape[0]
+ xb = xb.numpy()
+ xb = np.ascontiguousarray(xb)
+
+ cuda.memcpy_htod(inputs[0]["allocation"], xb)
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(output, outputs[0]["allocation"])
+
+ output = output.reshape(output.shape[0],output.shape[1])
+ #print("output shape ",output.shape)
+
+ classes.extend(yb[0:current_imgs_num].numpy())
+ embeddings.extend(output)
+
+
+ embeddings_dict = dict(zip(crop_paths,embeddings))
+
+ pairs = read_pairs(config.datasets_dir + config.pairs_name)
+ path_list, issame_list = get_paths(config.datasets_dir + 'lfw', pairs)
+ # embeddings = np.array([embeddings_dict[path.replace(".png",".jpg")] for path in path_list])
+ embeddings = np.array([embeddings_dict[path] for path in path_list])
+ tpr, fpr, accuracy, val, val_std, far, fp, fn = evaluate(embeddings, issame_list)
+
+ print('\nAccuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
+ print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
+
+ auc = metrics.auc(fpr, tpr)
+ print('Area Under Curve (AUC): %1.3f' % auc)
+ #eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr, fill_value="extrapolate")(x), 0., 1.)
+ #print('Equal Error Rate (EER): %1.3f' % eer)
+
+ acc = np.mean(accuracy)
+ print(f"Accuracy Check : Test {acc} >= target {config.acc_target}")
+ if acc >= config.acc_target:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+    parser.add_argument("--test_mode", type=str, default="FPS", help="FPS or ACC")
+ parser.add_argument(
+ "--engine_file",
+ type=str,
+ help="engine file path"
+ )
+ parser.add_argument(
+ "--datasets_dir",
+ type=str,
+ default="",
+        help="facenet datasets dir (contains the lfw folder and pairs.txt)",
+ )
+    parser.add_argument("--pairs_name", type=str, default="pairs.txt", help="LFW pairs file name")
+ parser.add_argument("--warm_up", type=int, default=-1, help="warm_up times")
+ parser.add_argument("--bsz", type=int, default=32, help="test batch size")
+ parser.add_argument(
+ "--imgsz",
+ "--img",
+ "--img-size",
+ type=int,
+ default=160,
+ help="inference size h,w",
+ )
+ parser.add_argument("--use_async", action="store_true")
+ parser.add_argument(
+ "--device", type=int, default=0, help="cuda device, i.e. 0 or 0,1,2,3,4"
+ )
+ parser.add_argument("--fps_target", type=float, default=-1.0)
+ parser.add_argument("--acc_target", type=float, default=-1.0)
+ parser.add_argument("--loop_count", type=int, default=-1)
+
+ config = parser.parse_args()
+ return config
+
+if __name__ == "__main__":
+ config = parse_config()
+ main(config)
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/load_ixrt_plugin.py b/models/cv/face/facenet/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae47dc8e854b6bea1f768e65c4dd481048bfebce
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+import tensorrt
+from os.path import join, dirname, exists
+def load_ixrt_plugin(logger=tensorrt.Logger(tensorrt.Logger.INFO), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = join(dirname(tensorrt.__file__), "lib", "libixrt_plugin.so")
+ if not exists(dynamic_path):
+ raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist, please provide a valid plugin path!")
+ ctypes.CDLL(dynamic_path)
+ tensorrt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/quant.py b/models/cv/face/facenet/ixrt/quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..26413e3e0f58f219cce2bd78804de288cba1fd1a
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/quant.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import torch
+from tensorrt.deploy.api import *
+from tensorrt.deploy.utils.seed import manual_seed
+from torchvision import models
+from argparse import ArgumentParser
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+from torchvision import datasets, transforms
+import cv2
+import numpy as np
+import math
+import simplejson as json
+from tensorrt.deploy import static_quantize
+
+
+# manual_seed(43)
+device = 0 if torch.cuda.is_available() else "cpu"
+
+
+def fixed_image_standardization(image_tensor):
+ processed_tensor = (image_tensor - 127.5) / 128.0
+ return processed_tensor
+
+def create_dataloader(args):
+ image_dir_path = os.path.join(args.data_path, "lfw")
+
+ trans = transforms.Compose([
+ np.float32,
+ transforms.ToTensor(),
+ fixed_image_standardization
+ ])
+
+ dataset = datasets.ImageFolder(args.data_path + 'lfw', transform=trans)
+
+ calibration_dataset = dataset
+ print("image folder total images : ", len(dataset))
+ if args.num_samples is not None:
+ indices = np.random.permutation(len(dataset))[:args.num_samples]
+ calibration_dataset = torch.utils.data.Subset(
+ dataset, indices=indices
+ )
+ print("calibration_dataset images : ", len(calibration_dataset))
+
+ assert len(dataset), f"data size is 0, check data path please"
+ calibration_dataloader = DataLoader(
+ calibration_dataset,
+ batch_size=args.batch_size,
+ shuffle=True,
+ num_workers=args.workers,
+ )
+ verify_dataloader = DataLoader(
+ dataset,
+ batch_size=args.batch_size,
+ shuffle=True,
+ num_workers=args.workers,
+ )
+
+ return calibration_dataloader, verify_dataloader
+
+
+@torch.no_grad()
+def quantize_model(args, model_name, model, dataloader):
+
+ calibration_dataloader, verify_dataloader = dataloader
+ print("calibration dataset length: ", len(calibration_dataloader))
+
+ if isinstance(model, torch.nn.Module):
+ model = model.to(device)
+ model.eval()
+
+ static_quantize(args.model,
+ calibration_dataloader=calibration_dataloader,
+ save_quant_onnx_path=os.path.join("./facenet_weights", f"{model_name}-quant.onnx"),
+ observer=args.observer,
+ data_preprocess=lambda x: x[0].to("cuda"),
+ quant_format="qdq",
+ disable_quant_names=None)
+
+def create_argparser(*args, **kwargs):
+ parser = ArgumentParser(*args, **kwargs)
+ parser.add_argument("--batch_size", type=int, default=64)
+ parser.add_argument("--img_size", type=int, default=160)
+ parser.add_argument("-j", "--workers", type=int, default=4)
+ parser.add_argument("--model", type=str, default="./facenet_weights/facenet.onnx")
+ parser.add_argument("--num_samples", type=int, default=1000)
+ parser.add_argument("--data_path", type=str, default="./facenet_datasets/")
+ parser.add_argument("--analyze", action="store_true")
+ parser.add_argument("--observer", type=str, default="hist_percentile")
+ parser.add_argument("--fp32_acc", action="store_true")
+ parser.add_argument("--use_ixrt", action="store_true")
+ parser.add_argument("--quant_params", type=str, default=None)
+ parser.add_argument("--disable_bias_correction", action="store_true")
+ return parser
+
+def parse_args():
+ parser = create_argparser("PTQ Quantization")
+ args = parser.parse_args()
+ args.use_ixquant = not args.use_ixrt
+ return args
+
+
+def main():
+ args = parse_args()
+ print(args)
+ dataloader = create_dataloader(args)
+
+ if args.model.endswith(".onnx"):
+ model_name = os.path.basename(args.model)
+ model_name = model_name.rsplit(".", maxsplit=1)[0]
+ model = args.model
+    else:
+        print("[Error] model file must be an .onnx file, got: ", args.model)
+        return
+    quantize_model(args, model_name, model, dataloader)
+
+if __name__ == "__main__":
+ main()
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..27e5e8ad859d95c86dfc9b29fdc78150b0c60c95
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_accuracy.sh
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..401658cafd85297b9d98f7febb9e7c88746062ef
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_fp16_performance.sh
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=float16
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c2c2f176bcd0ea6bb00acedb6fbda80b47456a08
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_accuracy.sh
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=0
+LOOP_COUNT=-1
+RUN_MODE=ACC
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --data_path ${DATASETS_DIR} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7574347c028dfdb28e3b06016d4c61fb6d3e1328
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/scripts/infer_facenet_int8_performance.sh
@@ -0,0 +1,138 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+#!/bin/bash
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ EXIT_STATUS=1
+ fi
+}
+
+# Run parameters
+BSZ=64
+TGT=-1
+WARM_UP=3
+LOOP_COUNT=20
+RUN_MODE=FPS
+PRECISION=int8
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+ index=`expr $index + 1`
+ case $argument in
+ --bs) BSZ=${arguments[index]};;
+ --tgt) TGT=${arguments[index]};;
+ esac
+done
+PROJ_DIR=$(cd $(dirname $0);cd ../../; pwd)
+echo PROJ_DIR : ${PROJ_DIR}
+RUN_DIR="${PROJ_DIR}/ixrt/"
+DATASETS_DIR="${RUN_DIR}/facenet_datasets/"
+CHECKPOINTS_DIR="${RUN_DIR}/facenet_weights/"
+CONFIG_DIR="${PROJ_DIR}/ixrt/config/FACENET_CONFIG"
+source ${CONFIG_DIR}
+ORIGINE_MODEL=${CHECKPOINTS_DIR}/${ORIGINE_MODEL}
+
+echo CHECKPOINTS_DIR : ${CHECKPOINTS_DIR}
+echo DATASETS_DIR : ${DATASETS_DIR}
+echo RUN_DIR : ${RUN_DIR}
+echo CONFIG_DIR : ${CONFIG_DIR}
+echo ====================== Model Info ======================
+echo Model Name : ${MODEL_NAME}
+echo Model Input Name : ${MODEL_INPUT_NAME}
+echo Model Output Name : ${MODEL_OUTPUT_NAME}
+echo Onnx Path : ${ORIGINE_MODEL}
+
+step=0
+SIM_MODEL=${CHECKPOINTS_DIR}/${MODEL_NAME}.onnx
+
+# Simplify Model
+let step++
+echo;
+echo [STEP ${step}] : Simplify Model
+if [ -f ${SIM_MODEL} ];then
+    echo " "Simplify Model Skip, ${SIM_MODEL} already exists
+else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/deploy.py \
+ --onnx_name ${CHECKPOINTS_DIR}/facenet_export.onnx
+ echo " "Generate ${SIM_MODEL}
+fi
+
+# Quant Model
+if [ $PRECISION == "int8" ];then
+ let step++
+ echo;
+ echo [STEP ${step}] : Quant Model
+ if [[ -z ${QUANT_EXIST_ONNX} ]];then
+ QUANT_EXIST_ONNX=$CHECKPOINTS_DIR/${MODEL_NAME}-quant.onnx
+ fi
+ if [[ -f ${QUANT_EXIST_ONNX} ]];then
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+        echo " "Quant Model Skip, ${QUANT_EXIST_ONNX} already exists
+ else
+ cd $RUN_DIR
+ python3 ${RUN_DIR}/quant.py \
+ --model ${SIM_MODEL} \
+ --batch_size ${QUANT_BATCHSIZE} \
+ --data_path ${DATASETS_DIR} \
+ --img_size ${IMGSIZE} \
+ --num_samples 6400 \
+ --observer ${QUANT_OBSERVER} \
+ --disable_bias_correction
+ SIM_MODEL=${QUANT_EXIST_ONNX}
+ echo " "Generate ${SIM_MODEL}
+ fi
+fi
+
+
+# Build Engine
+let step++
+echo;
+echo [STEP ${step}] : Build Engine
+ENGINE_FILE=${CHECKPOINTS_DIR}/${MODEL_NAME}_${PRECISION}_bs${BSZ}.engine
+FINAL_MODEL=${SIM_MODEL}
+if [ -f $ENGINE_FILE ];then
+    echo " "Build Engine Skip, $ENGINE_FILE already exists
+else
+ python3 ${RUN_DIR}/build_engine.py \
+ --precision ${PRECISION} \
+ --model ${FINAL_MODEL} \
+ --engine ${ENGINE_FILE}
+ echo " "Generate Engine ${ENGINE_FILE}
+fi
+
+# Inference
+let step++
+echo;
+echo [STEP ${step}] : Inference
+python3 ${RUN_DIR}/inference.py \
+ --engine_file=${ENGINE_FILE} \
+ --datasets_dir=${DATASETS_DIR} \
+ --imgsz=${IMGSIZE} \
+ --warm_up=${WARM_UP} \
+ --loop_count ${LOOP_COUNT} \
+ --test_mode ${RUN_MODE} \
+ --fps_target ${TGT} \
+ --bsz ${BSZ}; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/models/cv/face/facenet/ixrt/tensorflow2pytorch.py b/models/cv/face/facenet/ixrt/tensorflow2pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76ba0fff91ae1ac334c2babbc10f0d65139b711
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/tensorflow2pytorch.py
@@ -0,0 +1,387 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tensorflow.compat.v1 as tf
+tf.disable_v2_behavior()
+import torch
+import json
+import os, sys
+
+from dependencies.facenet.src import facenet
+from dependencies.facenet.src.models import inception_resnet_v1 as tf_mdl
+from dependencies.facenet.src.align import detect_face
+
+from models.inception_resnet_v1 import InceptionResnetV1
+from models.mtcnn import PNet, RNet, ONet
+
+
+def import_tf_params(tf_mdl_dir, sess):
+ """Import tensorflow model from save directory.
+
+ Arguments:
+ tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
+ sess {tensorflow.Session} -- Tensorflow session object.
+
+ Returns:
+ (list, list, list) -- Tuple of lists containing the layer names,
+ parameter arrays as numpy ndarrays, parameter shapes.
+ """
+ print('\nLoading tensorflow model\n')
+ if callable(tf_mdl_dir):
+ tf_mdl_dir(sess)
+ else:
+ facenet.load_model(tf_mdl_dir)
+
+ print('\nGetting model weights\n')
+ images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
+ print(images_placeholder)
+ tf_layers = tf.trainable_variables()
+ tf_params = sess.run(tf_layers)
+ print(tf.get_default_graph())
+
+ tf_shapes = [p.shape for p in tf_params]
+ tf_layers = [l.name for l in tf_layers]
+
+    print(tf_shapes)
+
+ if not callable(tf_mdl_dir):
+ path = os.path.join(tf_mdl_dir, 'layer_description.json')
+ else:
+ path = 'data/layer_description.json'
+ with open(path, 'w') as f:
+ json.dump({l: s for l, s in zip(tf_layers, tf_shapes)}, f)
+
+ return tf_layers, tf_params, tf_shapes
+
+
+def get_layer_indices(layer_lookup, tf_layers):
+ """Giving a lookup of model layer attribute names and tensorflow variable names,
+ find matching parameters.
+
+ Arguments:
+ layer_lookup {dict} -- Dictionary mapping pytorch attribute names to (partial)
+ tensorflow variable names. Expects dict of the form {'attr': ['tf_name', ...]}
+ where the '...'s are ignored.
+ tf_layers {list} -- List of tensorflow variable names.
+
+ Returns:
+        dict -- The input dictionary with the list of matching indices appended to each item.
+ """
+ layer_inds = {}
+ for name, value in layer_lookup.items():
+ layer_inds[name] = value + [[i for i, n in enumerate(tf_layers) if value[0] in n]]
+ return layer_inds
+
+
+def load_tf_batchNorm(weights, layer):
+ """Load tensorflow weights into nn.BatchNorm object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.BatchNorm.
+ """
+ layer.bias.data = torch.tensor(weights[0]).view(layer.bias.data.shape)
+ layer.weight.data = torch.ones_like(layer.weight.data)
+ layer.running_mean = torch.tensor(weights[1]).view(layer.running_mean.shape)
+ layer.running_var = torch.tensor(weights[2]).view(layer.running_var.shape)
+
+
+def load_tf_conv2d(weights, layer, transpose=False):
+ """Load tensorflow weights into nn.Conv2d object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.Conv2d.
+ """
+ if isinstance(weights, list):
+ if len(weights) == 2:
+ layer.bias.data = (
+ torch.tensor(weights[1])
+ .view(layer.bias.data.shape)
+ )
+ weights = weights[0]
+
+ if transpose:
+ dim_order = (3, 2, 1, 0)
+ else:
+ dim_order = (3, 2, 0, 1)
+
+ layer.weight.data = (
+ torch.tensor(weights)
+ .permute(dim_order)
+ .view(layer.weight.data.shape)
+ )
+
+
+def load_tf_conv2d_trans(weights, layer):
+ return load_tf_conv2d(weights, layer, transpose=True)
+
+
+def load_tf_basicConv2d(weights, layer):
+ """Load tensorflow weights into grouped Conv2d+BatchNorm object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- Object containing Conv2d+BatchNorm.
+ """
+ load_tf_conv2d(weights[0], layer.conv)
+ load_tf_batchNorm(weights[1:], layer.bn)
+
+
+def load_tf_linear(weights, layer):
+ """Load tensorflow weights into nn.Linear object.
+
+ Arguments:
+ weights {list} -- Tensorflow parameters.
+ layer {torch.nn.Module} -- nn.Linear.
+ """
+ if isinstance(weights, list):
+ if len(weights) == 2:
+ layer.bias.data = (
+ torch.tensor(weights[1])
+ .view(layer.bias.data.shape)
+ )
+ weights = weights[0]
+ layer.weight.data = (
+ torch.tensor(weights)
+ .transpose(-1, 0)
+ .view(layer.weight.data.shape)
+ )
+
+
+# High-level parameter-loading functions:
+
+def load_tf_block35(weights, layer):
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch2[0])
+ load_tf_basicConv2d(weights[16:20], layer.branch2[1])
+ load_tf_basicConv2d(weights[20:24], layer.branch2[2])
+ load_tf_conv2d(weights[24:26], layer.conv2d)
+
+
+def load_tf_block17_8(weights, layer):
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[2])
+ load_tf_conv2d(weights[16:18], layer.conv2d)
+
+
+def load_tf_mixed6a(weights, layer):
+ if len(weights) != 16:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 16')
+ load_tf_basicConv2d(weights[:4], layer.branch0)
+ load_tf_basicConv2d(weights[4:8], layer.branch1[0])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[1])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[2])
+
+
+def load_tf_mixed7a(weights, layer):
+ if len(weights) != 28:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not equal to 28')
+ load_tf_basicConv2d(weights[:4], layer.branch0[0])
+ load_tf_basicConv2d(weights[4:8], layer.branch0[1])
+ load_tf_basicConv2d(weights[8:12], layer.branch1[0])
+ load_tf_basicConv2d(weights[12:16], layer.branch1[1])
+ load_tf_basicConv2d(weights[16:20], layer.branch2[0])
+ load_tf_basicConv2d(weights[20:24], layer.branch2[1])
+ load_tf_basicConv2d(weights[24:28], layer.branch2[2])
+
+
+def load_tf_repeats(weights, layer, rptlen, subfun):
+ if len(weights) % rptlen != 0:
+ raise ValueError(f'Number of weight arrays ({len(weights)}) not divisible by {rptlen}')
+ weights_split = [weights[i:i+rptlen] for i in range(0, len(weights), rptlen)]
+ for i, w in enumerate(weights_split):
+ subfun(w, getattr(layer, str(i)))
+
+
+def load_tf_repeat_1(weights, layer):
+ load_tf_repeats(weights, layer, 26, load_tf_block35)
+
+
+def load_tf_repeat_2(weights, layer):
+ load_tf_repeats(weights, layer, 18, load_tf_block17_8)
+
+
+def load_tf_repeat_3(weights, layer):
+ load_tf_repeats(weights, layer, 18, load_tf_block17_8)
+
+
+def test_loaded_params(mdl, tf_params, tf_layers):
+ """Check each parameter in a pytorch model for an equivalent parameter
+ in a list of tensorflow variables.
+
+ Arguments:
+ mdl {torch.nn.Module} -- Pytorch model.
+ tf_params {list} -- List of ndarrays representing tensorflow variables.
+ tf_layers {list} -- Corresponding list of tensorflow variable names.
+ """
+ tf_means = torch.stack([torch.tensor(p).mean() for p in tf_params])
+ for name, param in mdl.named_parameters():
+ pt_mean = param.data.mean()
+ matching_inds = ((tf_means - pt_mean).abs() < 1e-8).nonzero()
+ print(f'{name} equivalent to {[tf_layers[i] for i in matching_inds]}')
+
+
+def compare_model_outputs(pt_mdl, sess, test_data):
+ """Given some testing data, compare the output of pytorch and tensorflow models.
+
+ Arguments:
+ pt_mdl {torch.nn.Module} -- Pytorch model.
+ sess {tensorflow.Session} -- Tensorflow session object.
+ test_data {torch.Tensor} -- Pytorch tensor.
+ """
+ print('\nPassing test data through TF model\n')
+ if isinstance(sess, tf.Session):
+ images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
+ phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
+ embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
+ feed_dict = {images_placeholder: test_data.numpy(), phase_train_placeholder: False}
+ tf_output = torch.tensor(sess.run(embeddings, feed_dict=feed_dict))
+ else:
+ tf_output = sess(test_data)
+
+ print(tf_output.shape, tf_output)
+
+ print('\nPassing test data through PT model\n')
+ pt_output = pt_mdl(test_data.permute(0, 3, 1, 2))
+ print(pt_output.shape, pt_output)
+
+ distance = (tf_output - pt_output).norm()
+ print(f'\nDistance {distance}\n')
+
+
+def compare_mtcnn(pt_mdl, tf_fun, sess, ind, test_data):
+ tf_mdls = tf_fun(sess)
+ tf_mdl = tf_mdls[ind]
+
+ print('\nPassing test data through TF model\n')
+ tf_output = tf_mdl(test_data.numpy())
+ tf_output = [torch.tensor(out) for out in tf_output]
+ print('\n'.join([str(o.view(-1)[:10]) for o in tf_output]))
+
+ print('\nPassing test data through PT model\n')
+ with torch.no_grad():
+ pt_output = pt_mdl(test_data.permute(0, 3, 2, 1))
+ pt_output = [torch.tensor(out) for out in pt_output]
+ for i in range(len(pt_output)):
+ if len(pt_output[i].shape) == 4:
+ pt_output[i] = pt_output[i].permute(0, 3, 2, 1).contiguous()
+ print('\n'.join([str(o.view(-1)[:10]) for o in pt_output]))
+
+ distance = [(tf_o - pt_o).norm() for tf_o, pt_o in zip(tf_output, pt_output)]
+ print(f'\nDistance {distance}\n')
+
+
+def load_tf_model_weights(mdl, layer_lookup, tf_mdl_dir, is_resnet=True, arg_num=None):
+ """Load tensorflow parameters into a pytorch model.
+
+ Arguments:
+ mdl {torch.nn.Module} -- Pytorch model.
+ layer_lookup {[type]} -- Dictionary mapping pytorch attribute names to (partial)
+ tensorflow variable names, and a function suitable for loading weights.
+ Expects dict of the form {'attr': ['tf_name', function]}.
+ tf_mdl_dir {str} -- Location of protobuf, checkpoint, meta files.
+ """
+ tf.reset_default_graph()
+ with tf.Session() as sess:
+ tf_layers, tf_params, tf_shapes = import_tf_params(tf_mdl_dir, sess)
+ layer_info = get_layer_indices(layer_lookup, tf_layers)
+
+ for layer_name, info in layer_info.items():
+ print(f'Loading {info[0]}/* into {layer_name}')
+ weights = [tf_params[i] for i in info[2]]
+ layer = getattr(mdl, layer_name)
+ info[1](weights, layer)
+
+ test_loaded_params(mdl, tf_params, tf_layers)
+
+ if is_resnet:
+ compare_model_outputs(mdl, sess, torch.randn(5, 160, 160, 3).detach())
+
+
+def tensorflow2pytorch(args):
+ lookup_inception_resnet_v1 = {
+ 'conv2d_1a': ['InceptionResnetV1/Conv2d_1a_3x3', load_tf_basicConv2d],
+ 'conv2d_2a': ['InceptionResnetV1/Conv2d_2a_3x3', load_tf_basicConv2d],
+ 'conv2d_2b': ['InceptionResnetV1/Conv2d_2b_3x3', load_tf_basicConv2d],
+ 'conv2d_3b': ['InceptionResnetV1/Conv2d_3b_1x1', load_tf_basicConv2d],
+ 'conv2d_4a': ['InceptionResnetV1/Conv2d_4a_3x3', load_tf_basicConv2d],
+ 'conv2d_4b': ['InceptionResnetV1/Conv2d_4b_3x3', load_tf_basicConv2d],
+ 'repeat_1': ['InceptionResnetV1/Repeat/block35', load_tf_repeat_1],
+ 'mixed_6a': ['InceptionResnetV1/Mixed_6a', load_tf_mixed6a],
+ 'repeat_2': ['InceptionResnetV1/Repeat_1/block17', load_tf_repeat_2],
+ 'mixed_7a': ['InceptionResnetV1/Mixed_7a', load_tf_mixed7a],
+ 'repeat_3': ['InceptionResnetV1/Repeat_2/block8', load_tf_repeat_3],
+ 'block8': ['InceptionResnetV1/Block8', load_tf_block17_8],
+ 'last_linear': ['InceptionResnetV1/Bottleneck/weights', load_tf_linear],
+ 'last_bn': ['InceptionResnetV1/Bottleneck/BatchNorm', load_tf_batchNorm],
+ # 'logits': ['Logits', load_tf_linear],
+ }
+
+ print('\nLoad CASIA-Webface-trained weights and save\n')
+ mdl = InceptionResnetV1(num_classes=10575).eval()
+ tf_mdl_dir = args.facenet_pb_path
+
+ load_tf_model_weights(mdl, lookup_inception_resnet_v1, tf_mdl_dir)
+ # data_name = 'casia-webfacexxxxxxx'
+ # state_dict = mdl.state_dict()
+ # torch.save(state_dict, f'{tf_mdl_dir}-{data_name}.pt')
+
+ x = torch.rand(64, 3, 160, 160)#.cuda()
+ # y = resnet(x)
+ # print(y.shape)
+
+
+ f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
+ torch.onnx.export(mdl, x, f, verbose=False, opset_version=11,
+ input_names=['input'], output_names=['output'], dynamic_axes=None)
+
+
+
+import argparse
+def parse_args():
+ parser = argparse.ArgumentParser("deploy facenet")
+ parser.add_argument("--facenet_weights_path", default="", help="onnx model path")
+ parser.add_argument("--facenet_pb_path", default="", help="")
+ parser.add_argument("--onnx_save_name", default="", help="")
+
+ return parser.parse_args()
+args = parse_args()
+
+tensorflow2pytorch(args)
+
+
+# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+# print('Running on device: {}'.format(device))
+
+# # Load pretrained resnet model
+# resnet = InceptionResnetV1(
+# classify=False,
+# pretrained='casia-webface'
+# )#.to(device)
+
+# x = torch.rand(64, 3, 160, 160)#.cuda()
+# y = resnet(x)
+# print(y.shape)
+
+
+# f = f"{args.facenet_weights_path}/{args.onnx_save_name}"
+# torch.onnx.export(resnet, x, f, verbose=False, opset_version=11, input_names=['input'], output_names=['output'], dynamic_axes=None)
diff --git a/models/cv/face/facenet/ixrt/utils.py b/models/cv/face/facenet/ixrt/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab8f213bf6bf629ad073140f4ab886760c707759
--- /dev/null
+++ b/models/cv/face/facenet/ixrt/utils.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import math
+
+from sklearn.model_selection import KFold
+from scipy import interpolate
+import numpy as np
+
+
+# LFW functions taken from David Sandberg's FaceNet implementation
+def distance(embeddings1, embeddings2, distance_metric=0):
+ if distance_metric==0:
+        # Euclidean distance
+ diff = np.subtract(embeddings1, embeddings2)
+ dist = np.sum(np.square(diff),1)
+ elif distance_metric==1:
+ # Distance based on cosine similarity
+ dot = np.sum(np.multiply(embeddings1, embeddings2), axis=1)
+ norm = np.linalg.norm(embeddings1, axis=1) * np.linalg.norm(embeddings2, axis=1)
+ similarity = dot / norm
+ dist = np.arccos(similarity) / math.pi
+ else:
+        raise ValueError('Undefined distance metric %d' % distance_metric)
+
+ return dist
+
+def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ assert(embeddings1.shape[0] == embeddings2.shape[0])
+ assert(embeddings1.shape[1] == embeddings2.shape[1])
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+ nrof_thresholds = len(thresholds)
+ k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+ tprs = np.zeros((nrof_folds,nrof_thresholds))
+ fprs = np.zeros((nrof_folds,nrof_thresholds))
+ accuracy = np.zeros((nrof_folds))
+
+ is_false_positive = []
+ is_false_negative = []
+
+ indices = np.arange(nrof_pairs)
+
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+ if subtract_mean:
+ mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
+ else:
+ mean = 0.0
+ dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+
+ # Find the best threshold for the fold
+ acc_train = np.zeros((nrof_thresholds))
+ for threshold_idx, threshold in enumerate(thresholds):
+ _, _, acc_train[threshold_idx], _ ,_ = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
+ best_threshold_index = np.argmax(acc_train)
+ for threshold_idx, threshold in enumerate(thresholds):
+ tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _, _, _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
+ _, _, accuracy[fold_idx], is_fp, is_fn = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])
+
+ tpr = np.mean(tprs,0)
+ fpr = np.mean(fprs,0)
+ is_false_positive.extend(is_fp)
+ is_false_negative.extend(is_fn)
+
+ return tpr, fpr, accuracy, is_false_positive, is_false_negative
+
+def calculate_accuracy(threshold, dist, actual_issame):
+ predict_issame = np.less(dist, threshold)
+ tp = np.sum(np.logical_and(predict_issame, actual_issame))
+ fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+ tn = np.sum(np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)))
+ fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame))
+
+ is_fp = np.logical_and(predict_issame, np.logical_not(actual_issame))
+ is_fn = np.logical_and(np.logical_not(predict_issame), actual_issame)
+
+ tpr = 0 if (tp+fn==0) else float(tp) / float(tp+fn)
+ fpr = 0 if (fp+tn==0) else float(fp) / float(fp+tn)
+ acc = float(tp+tn)/dist.size
+ return tpr, fpr, acc, is_fp, is_fn
+
+def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ assert(embeddings1.shape[0] == embeddings2.shape[0])
+ assert(embeddings1.shape[1] == embeddings2.shape[1])
+ nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
+ nrof_thresholds = len(thresholds)
+ k_fold = KFold(n_splits=nrof_folds, shuffle=False)
+
+ val = np.zeros(nrof_folds)
+ far = np.zeros(nrof_folds)
+
+ indices = np.arange(nrof_pairs)
+
+ for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
+ if subtract_mean:
+ mean = np.mean(np.concatenate([embeddings1[train_set], embeddings2[train_set]]), axis=0)
+ else:
+ mean = 0.0
+ dist = distance(embeddings1-mean, embeddings2-mean, distance_metric)
+
+ # Find the threshold that gives FAR = far_target
+ far_train = np.zeros(nrof_thresholds)
+ for threshold_idx, threshold in enumerate(thresholds):
+ _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
+ if np.max(far_train)>=far_target:
+ f = interpolate.interp1d(far_train, thresholds, kind='slinear')
+ threshold = f(far_target)
+ else:
+ threshold = 0.0
+
+ val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])
+
+ val_mean = np.mean(val)
+ far_mean = np.mean(far)
+ val_std = np.std(val)
+ return val_mean, val_std, far_mean
+
+def calculate_val_far(threshold, dist, actual_issame):
+ predict_issame = np.less(dist, threshold)
+ true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
+ false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
+ n_same = np.sum(actual_issame)
+ n_diff = np.sum(np.logical_not(actual_issame))
+ val = float(true_accept) / float(n_same)
+ far = float(false_accept) / float(n_diff)
+ return val, far
+
+
+
+def evaluate(embeddings, actual_issame, nrof_folds=10, distance_metric=0, subtract_mean=False):
+ # Calculate evaluation metrics
+ thresholds = np.arange(0, 4, 0.01)
+ embeddings1 = embeddings[0::2]
+ embeddings2 = embeddings[1::2]
+ tpr, fpr, accuracy, fp, fn = calculate_roc(thresholds, embeddings1, embeddings2,
+ np.asarray(actual_issame), nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+ thresholds = np.arange(0, 4, 0.001)
+ val, val_std, far = calculate_val(thresholds, embeddings1, embeddings2,
+ np.asarray(actual_issame), 1e-3, nrof_folds=nrof_folds, distance_metric=distance_metric, subtract_mean=subtract_mean)
+ return tpr, fpr, accuracy, val, val_std, far, fp, fn
+
+def add_extension(path):
+ if os.path.exists(path+'.jpg'):
+ return path+'.jpg'
+ elif os.path.exists(path+'.png'):
+ return path+'.png'
+ else:
+ raise RuntimeError('No file "%s" with extension png or jpg.' % path)
+
+def get_paths(lfw_dir, pairs):
+ nrof_skipped_pairs = 0
+ path_list = []
+ issame_list = []
+ for pair in pairs:
+ if len(pair) == 3:
+ path0 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])))
+ path1 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[2])))
+ issame = True
+ elif len(pair) == 4:
+ path0 = add_extension(os.path.join(lfw_dir, pair[0], pair[0] + '_' + '%04d' % int(pair[1])))
+ path1 = add_extension(os.path.join(lfw_dir, pair[2], pair[2] + '_' + '%04d' % int(pair[3])))
+ issame = False
+ if os.path.exists(path0) and os.path.exists(path1): # Only add the pair if both paths exist
+ path_list += (path0,path1)
+ issame_list.append(issame)
+ else:
+ nrof_skipped_pairs += 1
+ if nrof_skipped_pairs>0:
+ print('Skipped %d image pairs' % nrof_skipped_pairs)
+
+ return path_list, issame_list
+
+def read_pairs(pairs_filename):
+ pairs = []
+ with open(pairs_filename, 'r') as f:
+ for line in f.readlines()[1:]:
+ pair = line.strip().split()
+ pairs.append(pair)
+ return np.array(pairs, dtype=object)
\ No newline at end of file
diff --git a/models/nlp/language_model/albert/ixrt/README.md b/models/nlp/language_model/albert/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cace00994d1b030154c2064f87d8f110029edbd0
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/README.md
@@ -0,0 +1,105 @@
+# AlBERT
+
+## Description
+
+Albert (A Lite BERT) is a variant of the BERT (Bidirectional Encoder Representations from Transformers) model that focuses on efficiency and scalability while maintaining strong performance in natural language processing tasks. It introduces parameter-reduction techniques, factorized embedding parameterization and cross-layer parameter sharing, together with a sentence-order prediction objective, to enhance its effectiveness.
+
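+As a rough illustration of the parameter reduction (a minimal, hypothetical sketch that is not part of this repository; it assumes `torch` and `transformers` are available as in the Install step below, and the ALBERT hyperparameters are chosen to approximate albert-base-v2):
+
+```python
+# Compare parameter counts of an ALBERT-base-like model and BERT-base built from
+# configs only; the models are randomly initialized, so no weights are downloaded.
+from transformers import AlbertConfig, AlbertModel, BertConfig, BertModel
+
+albert = AlbertModel(AlbertConfig(
+    embedding_size=128, hidden_size=768, num_hidden_layers=12,
+    num_attention_heads=12, intermediate_size=3072))  # factorized embeddings + cross-layer sharing
+bert = BertModel(BertConfig())  # default config corresponds to BERT-base
+
+count = lambda m: sum(p.numel() for p in m.parameters())
+print(f"ALBERT-base-like parameters: {count(albert) / 1e6:.1f}M")
+print(f"BERT-base parameters:        {count(bert) / 1e6:.1f}M")
+```
+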
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the squad dataset.
+
+Or you can run:
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/albert/ixrt
+cd ${MODEL_PATH}
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+Please correct the paths in the following commands or files.
+
+```bash
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/albert-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/open_albert/albert-base-squad.pt --output_path albert-torch-fp32.onnx
+onnxsim albert-torch-fp32.onnx albert-torch-fp32-sim.onnx
+```
+
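+After conversion, the simplified ONNX file can optionally be sanity-checked before building an engine (a minimal sketch, assuming the conversion above produced `albert-torch-fp32-sim.onnx` in the current directory and that the `onnx` package from the Install step is available):
+
+```python
+# Validate the exported graph and list its input/output names.
+import onnx
+
+model = onnx.load("albert-torch-fp32-sim.onnx")
+onnx.checker.check_model(model)
+print("inputs :", [i.name for i in model.graph.input])
+print("outputs:", [o.name for o in model.graph.output])
+```
+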
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./albert-torch-fp32-sim
+export OPTIMIER_FILE=./ixrt-oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash scripts/infer_albert_fp16_performance.sh
+```
+
+### Accuracy
+
+```bash
+# get madlag.tar
+wget http://files.deepspark.org.cn:880/deepspark/madlag.tar
+tar xvf madlag.tar
+rm -f madlag.tar
+
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# modify perf_engine.py
+mv ./perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# edit madlag/albert-base-v2-squad path
+sed -i "s#madlag#/${MODEL_PATH}/madlag#" ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py
+
+# copy open_squad data
+cp datasets/open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/
+
+# copy open_albert data
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_albert
+cp ./general_perf/model_zoo/popular/open_albert/*.pt ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_albert
+
+# run acc script
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/common.py
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/compile_backend_iluvatar.py
+sed -i 's/tensorrt_legacy/tensorrt/' ./backends/ILUVATAR/runtime_backend_iluvatar.py
+python3 core/perf_engine.py --hardware_type ILUVATAR --task albert-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Exact Match | F1 Score |
+| ------ | --------- | --------- | ----- | ----------- | -------- |
+| AlBERT | 1 | FP16 | 50.99 | 80.18 | 87.57 |
diff --git a/models/nlp/language_model/albert/ixrt/perf_engine.py b/models/nlp/language_model/albert/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+            log.info("******************************************* Running QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh b/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..977eb85c3f3cb4aa4b337c79c9246114f369bf41
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/scripts/infer_albert_fp16_performance.sh
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+
+run(){
+ BS=16
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..115faac30dcef7617327a4083e4a67f1ff4c322b
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_squad/
+
+wget -O general_perf/download/open_albert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_albert.tar
+tar xf general_perf/download/open_albert.tar -C general_perf/model_zoo/popular/
+
+
+# # Download Datasets
+wget -O general_perf/download/open_squad.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf general_perf/download/open_squad.tar -C datasets/open_squad/
+
+
+echo "Extract Done."
diff --git a/models/nlp/language_model/albert/ixrt/torch2onnx.py b/models/nlp/language_model/albert/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f115b730caf065b3f3dfc496c161916afc96d9e
--- /dev/null
+++ b/models/nlp/language_model/albert/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+    # output_path ends with ".onnx"; dropping the last four characters keeps the
+    # trailing dot, so model_name + "json" points at the matching "<name>.json" file.
+    model_name = output_path.split("/")[-1][:-4]
+    with open(model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/README.md b/models/nlp/language_model/deberta/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..221a33a895d476b2d73672ab6c26420528d0a33a
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/README.md
@@ -0,0 +1,102 @@
+# DeBERTa
+
+## Description
+
+DeBERTa (Decoding-enhanced BERT with disentangled attention) is an enhanced version of the BERT (Bidirectional Encoder Representations from Transformers) model. It improves text representation learning through a disentangled attention mechanism and a decoding enhancement technique. Disentangled attention decomposes the attention score into separate content and relative-position terms (content-to-content, content-to-position, and position-to-content), which helps the model better capture relationships between tokens. The decoding enhancement adjusts the decoder, incorporating absolute position information and adapting it during fine-tuning, so the model better suits specific downstream tasks and performs better on them.
+
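+The decomposition can be illustrated with a small, self-contained sketch (a toy example of the idea only, not the code used by this model; the tensor sizes and the relative-position bucketing below are arbitrary assumptions):
+
+```python
+# Toy disentangled attention: scores are the sum of content-to-content,
+# content-to-position and position-to-content terms, scaled by sqrt(3*d).
+import torch
+
+B, L, d = 2, 8, 16                      # batch, sequence length, head dimension
+Hc = torch.randn(B, L, d)               # content representations
+P = torch.randn(2 * L, d)               # shared relative-position embeddings
+rel = torch.arange(L)[:, None] - torch.arange(L)[None, :] + L  # relative-position bucket ids
+
+scores = torch.zeros(B, L, L)
+for i in range(L):
+    for j in range(L):
+        scores[:, i, j] = (
+            (Hc[:, i] * Hc[:, j]).sum(-1)          # content-to-content
+            + (Hc[:, i] * P[rel[i, j]]).sum(-1)    # content-to-position
+            + (P[rel[j, i]] * Hc[:, j]).sum(-1)    # position-to-content
+        )
+attn = torch.softmax(scores / (3 * d) ** 0.5, dim=-1)
+print(attn.shape)  # (B, L, L)
+```
+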
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/deberta/ixrt
+cd ${MODEL_PATH}
+
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install tf2onnx
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model: < >
+
+Dataset: < > to download the squad dataset.
+
+```bash
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+### Model Conversion
+
+```bash
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/deberta-torch-fp32.json
+python3 torch2onnx.py --model_path ./general_perf/model_zoo/popular/open_deberta/deberta-base-squad.pt --output_path deberta-torch-fp32.onnx
+onnxsim deberta-torch-fp32.onnx deberta-torch-fp32-sim.onnx
+python3 remove_clip_and_cast.py
+
+```
+
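+To confirm that the clean-up step worked, the remaining op types in the exported graph can be counted (a minimal sketch; it assumes `remove_clip_and_cast.py` writes `deberta-sim-drop-clip-drop-invaild-cast.onnx`, matching the `ORIGIN_ONNX_NAME` used below):
+
+```python
+# Count op types in the cleaned graph; Clip and the invalid Cast nodes should be gone.
+from collections import Counter
+import onnx
+
+model = onnx.load("deberta-sim-drop-clip-drop-invaild-cast.onnx")
+ops = Counter(node.op_type for node in model.graph.node)
+print("Clip nodes:", ops.get("Clip", 0), "| Cast nodes:", ops.get("Cast", 0))
+```
+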
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./deberta-sim-drop-clip-drop-invaild-cast
+export OPTIMIER_FILE=/Path/ixrt/oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+
+bash scripts/infer_deberta_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the website < >, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document: < >. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# setup
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cp ./datasets/open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/
+
+mv ./deberta-sim-drop-clip-drop-invaild-cast.onnx general_perf/model_zoo/popular/open_deberta/
+mv ./general_perf/model_zoo/popular/ ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/
+
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+wget http://files.deepspark.org.cn:880/deepspark/Palak.tar
+tar -zxvf Palak.tar
+
+# Then edit ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad/data_loader.py: change AutoTokenizer.from_pretrained("Palak/microsoft_deberta-base_squad") to AutoTokenizer.from_pretrained("/Your/Path/Palak/microsoft_deberta-base_squad")
+
+# run acc perf
+sed -i 's/tensorrt_legacy/tensorrt/g' backends/ILUVATAR/common.py
+python3 core/perf_engine.py --hardware_type ILUVATAR --task deberta-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Exact Match | F1 Score |
+| ------- | --------- | --------- | ----- | ----------- | -------- |
+| DeBERTa | 1 | FP16 | 18.58 | 73.76 | 81.24 |
diff --git a/models/nlp/language_model/deberta/ixrt/perf_engine.py b/models/nlp/language_model/deberta/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+            log.info("******************************************* Running QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py b/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..d362439f13a195b8ba895a70407a59ae881db181
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/remove_clip_and_cast.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx_graphsurgeon as gs
+import onnx
+
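+# Map an ONNX opset version to a compatible IR version so the re-saved model stays loadable.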
+onnx_op_set_2_ir_version = {
+ 11:6,
+ 12:7,
+ 13:7,
+}
+
+visited_add_tensor = {}
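+# Fold the Add/Clip arithmetic into the Add node's constant tensor: shift the
+# indices by 384 and clamp them to [0, 767] ahead of time, then feed the
+# precomputed tensor straight into Expand so the runtime Clip/Cast chain becomes
+# dead and is removed by graph.cleanup(). visited_add_tensor guards shared
+# constants against being offset twice.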
+def replace_expand_values(graph, expand_node, clip_node, cast_node, sub_node, add_node):
+ if add_node.inputs[0].name not in visited_add_tensor:
+ print(add_node.inputs[0].name)
+ print(add_node.inputs[0].values)
+ add_node.inputs[0].values = add_node.inputs[0].values + 384
+ add_node.inputs[0].values[add_node.inputs[0].values < 0] = 0
+ add_node.inputs[0].values[add_node.inputs[0].values > 767] = 767
+ print(add_node.inputs[0].values)
+ visited_add_tensor[add_node.inputs[0].name] = True
+ expand_node.inputs = [add_node.inputs[0]] + expand_node.inputs[1:]
+
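+# Locate Expand nodes whose data input is produced by a Clip, where the Clip's
+# upper bound comes from a Cast of a Sub and whose data input comes from the Add
+# node holding the constant index tensor; each match is folded by
+# replace_expand_values().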
+def replace_clip_related_nodes(graph):
+ node_name_to_index_map = {}
+ expand_node_names = []
+ output_name_to_node_name_map = {}
+ for i, node in enumerate(graph.nodes):
+ node_name_to_index_map[node.name] = i
+ if node.op == "Expand":
+ expand_node_names.append(node.name)
+ for j in node.outputs:
+ output_name_to_node_name_map[j.name] = node.name
+
+ for name in expand_node_names:
+ expand_node = graph.nodes[node_name_to_index_map[name]]
+ expand_producer_name = output_name_to_node_name_map[expand_node.inputs[0].name]
+ expand_producer = graph.nodes[node_name_to_index_map[expand_producer_name]]
+ if expand_producer.op == "Clip":
+ clip_node = expand_producer
+ clip_producer_name = output_name_to_node_name_map[clip_node.inputs[-1].name]
+ clip_producer = graph.nodes[node_name_to_index_map[clip_producer_name]]
+ if clip_producer.op == "Cast":
+ cast_producer_name = output_name_to_node_name_map[clip_producer.inputs[0].name]
+ cast_producer = graph.nodes[node_name_to_index_map[cast_producer_name]]
+ if cast_producer.op == "Sub":
+ add_node_name = output_name_to_node_name_map[clip_node.inputs[0].name]
+ add_node = graph.nodes[node_name_to_index_map[add_node_name]]
+ replace_expand_values(graph, expand_node, clip_node, clip_producer, cast_producer, add_node)
+
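+# Collapse back-to-back Cast nodes: when a Cast's input is itself produced by
+# another Cast, rewire it to the upstream Cast's inputs so the redundant
+# intermediate cast is dropped during cleanup().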
+def drop_cast_nodes(graph):
+ node_name_to_index_map = {}
+ cast_node_names = []
+ output_name_to_node_name_map = {}
+ for i, node in enumerate(graph.nodes):
+ node_name_to_index_map[node.name] = i
+ if node.op == "Cast":
+ cast_node_names.append(node.name)
+ for j in node.outputs:
+ output_name_to_node_name_map[j.name] = node.name
+
+ for name in cast_node_names:
+ cast_node = graph.nodes[node_name_to_index_map[name]]
+ cast_producer_name = output_name_to_node_name_map[cast_node.inputs[0].name]
+ cast_producer = graph.nodes[node_name_to_index_map[cast_producer_name]]
+ if cast_producer.op == "Cast":
+ cast_node.inputs = cast_producer.inputs
+
+
+input_path = r"./deberta-torch-fp32-sim.onnx"
+save_path = r"./deberta-sim-drop-clip-drop-invaild-cast.onnx"
+graph = gs.import_onnx(onnx.load(input_path))
+
+replace_clip_related_nodes(graph)
+drop_cast_nodes(graph)
+
+graph.cleanup().toposort()
+onnx.save(gs.export_onnx(graph), save_path)
+
+model = onnx.load(save_path)
+model.ir_version = onnx_op_set_2_ir_version[model.opset_import[0].version]
+onnx.save(model, save_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh b/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c9ced2418f6be8ff775b509d01d0db121af79108
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/scripts/infer_deberta_fp16_performance.sh
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384\
+ --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384 --plugins ixrt_plugin
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..575ab8f7d141b387de01d44e84962d8e7e7900dc
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_squad/
+
+wget -O general_perf/download/open_deberta.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_deberta.tar
+tar xf general_perf/download/open_deberta.tar -C general_perf/model_zoo/popular/
+
+
+# Download Datasets
+wget -O general_perf/download/open_squad.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf general_perf/download/open_squad.tar -C datasets/open_squad/
+
+
+echo "Extract Done."
diff --git a/models/nlp/language_model/deberta/ixrt/torch2onnx.py b/models/nlp/language_model/deberta/ixrt/torch2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7c06bbd18710f0820870c1ae5711505dd136bb
--- /dev/null
+++ b/models/nlp/language_model/deberta/ixrt/torch2onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split("/")[-1][:-4]
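+    # [:-4] strips only the "onnx" suffix, so model_name keeps its trailing dot
+    # and appending "json" below resolves to "<model>.json" in the current directory.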
+ with open("./" + model_name + "json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
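+    # Mark axis 0 (batch) of every input and output as dynamic so the exported
+    # ONNX model accepts arbitrary batch sizes.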
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/README.md b/models/nlp/language_model/roberta/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0588c797f1f8bf147fe5d37607cf34c2821e7f6d
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/README.md
@@ -0,0 +1,100 @@
+# RoBERTa
+
+## Description
+
+Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.
+
+## Setup
+
+### Install
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/roberta/ixrt
+cd ${MODEL_PATH}
+
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+pip3 install bert
+pip3 install pycuda
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+cd ${PROJ_ROOT}/models/nlp/language_model/roberta/ixrt/
+
+# get open_roberta
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roberta.tar
+tar xf open_roberta.tar
+rm -f open_roberta.tar
+
+# get roberta-torch-fp32.json
+wget https://raw.githubusercontent.com/bytedance/ByteMLPerf/main/byte_infer_perf/general_perf/model_zoo/roberta-torch-fp32.json
+
+# export onnx
+python3 export_onnx.py --model_path open_roberta/roberta-base-squad.pt --output_path open_roberta/roberta-torch-fp32.onnx
+
+# Simplify onnx model
+onnxsim open_roberta/roberta-torch-fp32.onnx open_roberta/roberta-torch-fp32_sim.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./open_roberta/roberta-torch-fp32_sim
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_roberta_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the ByteMLPerf website, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to the corresponding ByteMLPerf document. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# Link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Move open_roberta
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+mv open_roberta ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+
+# Get open_squad
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_squad.tar
+tar xf open_squad.tar
+cp ./open_squad/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_squad
+rm -f open_squad.tar
+
+# Get csarron.tar
+wget http://files.deepspark.org.cn:880/deepspark/csarron.tar
+tar xf csarron.tar
+rm -f csarron.tar
+mv csarron/ ./ByteMLPerf/byte_infer_perf/
+
+# Run Acc scripts
+cd ./ByteMLPerf/byte_infer_perf/
+python3 general_perf/core/perf_engine.py --hardware_type ILUVATAR --task roberta-torch-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | F1 | Exact Match |
+| ------- | --------- | --------- | ------ | -------- | ----------- |
+| RoBERTa | 1 | FP16 | 355.48 | 83.14387 | 76.50175 |
diff --git a/models/nlp/language_model/roberta/ixrt/export_onnx.py b/models/nlp/language_model/roberta/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc9d2da750a00a4eefd2323faf0354d9eb3eaf69
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/export_onnx.py
@@ -0,0 +1,73 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+import numpy as np
+import torch
+
+
+def torch_to_onnx(model_path, output_path):
+ model_name = output_path.split(".")[0]
+ with open(model_name + ".json", "r") as f:
+ model_info = json.load(f)
+ model_inputs = model_info["inputs"].split(",")
+ input_shapes = model_info["input_shape"]
+ input_type = model_info["input_type"].split(",")
+ example_inputs = _get_fake_samples(input_shapes, input_type)
+
+ model = torch.jit.load(model_path, map_location=torch.device("cpu"))
+ model.eval()
+
+ names = model_inputs
+ dynamic_inputs = {}
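+    # Mark axis 0 (batch) of every input and output as dynamic so the exported
+    # ONNX model accepts arbitrary batch sizes.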
+ for i in range(len(names)):
+ dynamic_inputs[names[i]] = {0: "batch_size"}
+ outputs = model_info["outputs"].split(",")
+ for output in outputs:
+ dynamic_inputs[output] = {0: "batch_size"}
+ torch.onnx.export(
+ model,
+ example_inputs,
+ output_path,
+ opset_version=11,
+ input_names=names,
+ output_names=outputs,
+ dynamic_axes=dynamic_inputs,
+ )
+
+
+def _get_fake_samples(shape, type):
+ data = []
+ idx = 0
+ for key, val in shape.items():
+ val = [val[0] * 1] + val[1:]
+ data.append(torch.from_numpy(np.random.random(val).astype(type[idx].lower())))
+ idx += 1
+ return data
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+if __name__ == "__main__":
+ args = get_args()
+ torch_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/gen_data.py b/models/nlp/language_model/roberta/ixrt/gen_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..a59225b2613b2e456b88ed4c79329287713e77a6
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/gen_data.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+
+import numpy as np
+import torch
+
+
+def gen_data(batch_size, output):
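+    # Random token ids drawn from RoBERTa's 50265-token vocabulary, with the
+    # fixed sequence length of 384 expected by the engine; the attention mask
+    # is all ones and the token type ids are all zeros.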
+ a = torch.randint(0, 50265, (batch_size, 384))
+ a = a.numpy().astype(np.int64)
+ a.tofile(output+"input_ids.bin")
+
+ a = np.ones((batch_size, 384), dtype=np.int64)
+ a.tofile(output+"input_mask.bin")
+
+ a = np.zeros((batch_size, 384), dtype=np.int64)
+ a.tofile(output+"token_type_ids.bin")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Generate data for RoBERTa model.")
+ parser.add_argument(
+ "--batch_size", type=int, required=True, help="Batch size for data generation"
+ )
+ parser.add_argument("--output_path", default="")
+
+ args = parser.parse_args()
+
+ gen_data(args.batch_size, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/perf_engine.py b/models/nlp/language_model/roberta/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f108474b281bfce71ccaf73d60ba3119cf97c1
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
\ No newline at end of file
diff --git a/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh b/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..90bdec9be8b064f41e4c5c96a40bd09d1f52b253
--- /dev/null
+++ b/models/nlp/language_model/roberta/ixrt/scripts/infer_roberta_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ python3 ${PROJ_PATH}/gen_data.py --batch_size ${BS} --output_path ${PROJ_PATH}
+
+ # Graph optimize
+ [ -f "${TARGET_ONNX}" ] || python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --opt_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --max_shape input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes input_ids.1:${BS}x384,attention_mask.1:${BS}x384,token_type_ids.1:${BS}x384
+
+}
+run 1
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/README.md b/models/nlp/language_model/roformer/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c088cf0f740821d5cc96557dbc53588f4ee5866f
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/README.md
@@ -0,0 +1,105 @@
+# RoFormer
+
+## Description
+
+Position encoding recently has shown effective in the transformer architecture. It enables valuable supervision for dependency modeling between elements at different positions of the sequence. In this paper, we first investigate various methods to integrate positional information into the learning process of transformer-based language models. Then, we propose a novel method named Rotary Position Embedding(RoPE) to effectively leverage the positional information. Specifically, the proposed RoPE encodes the absolute position with a rotation matrix and meanwhile incorporates the explicit relative position dependency in self-attention formulation. Notably, RoPE enables valuable properties, including the flexibility of sequence length, decaying inter-token dependency with increasing relative distances, and the capability of equipping the linear self-attention with relative position encoding. Finally, we evaluate the enhanced transformer with rotary position embedding, also called RoFormer, on various long text classification benchmark datasets.
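+
+As a rough illustration of the mechanism described above (a minimal NumPy sketch, not RoFormer's actual implementation), the snippet below rotates consecutive feature pairs of a query/key vector by position-dependent angles; because rotations compose, the resulting dot product depends only on the relative offset between the two positions.
+
+```python
+import numpy as np
+
+def rotary_embed(x, position, base=10000.0):
+    """Apply a simplified rotary position embedding to a 1-D feature vector."""
+    d = x.shape[-1]
+    # One rotation frequency per feature pair, as in sinusoidal encodings.
+    freqs = base ** (-np.arange(0, d, 2) / d)
+    theta = position * freqs
+    cos, sin = np.cos(theta), np.sin(theta)
+    x1, x2 = x[0::2], x[1::2]
+    out = np.empty_like(x)
+    out[0::2] = x1 * cos - x2 * sin
+    out[1::2] = x1 * sin + x2 * cos
+    return out
+
+q, k = np.random.randn(64), np.random.randn(64)
+# For fixed q and k, this score depends only on the relative offset 7 - 3.
+score = np.dot(rotary_embed(q, position=7), rotary_embed(k, position=3))
+```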
+
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install tf2onnx
+pip3 install pycuda
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/roformer/ixrt
+cd ${MODEL_PATH}
+
+# Download the pretrained model and dataset to 'data'
+mkdir -p data/
+pushd data/
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_roformer.tar
+tar xf open_roformer.tar
+rm -f open_roformer.tar
+popd
+```
+
+### Deal with ONNX
+
+```bash
+# export onnx
+python3 export_onnx.py --model_path ./data/open_roformer --output_path ./data/open_roformer/roformer-frozen_org.onnx
+
+# Simplify onnx model
+onnxsim ./data/open_roformer/roformer-frozen_org.onnx ./data/open_roformer/roformer-frozen.onnx
+python3 deploy.py --model_path ./data/open_roformer/roformer-frozen.onnx --output_path ./data/open_roformer/roformer-frozen.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./data/open_roformer/roformer-frozen
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_roformer_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the ByteMLPerf website, which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to the corresponding ByteMLPerf document. Note: you need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link ByteMLPerf and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Comment out line 102 in compile_backend_iluvatar.py
+sed -i '102s/build_engine/# build_engine/' ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/compile_backend_iluvatar.py
+
+# Move open_roformer
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+mv ./data/open_roformer ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/
+
+# Setup open_cail2019 dataset
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_cail2019.tar
+tar xf open_cail2019.tar
+cp ./open_cail2019/* ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cail2019
+rm -f open_cail2019.tar
+
+# Go to general_perf/
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+# Modify model_zoo/roformer-tf-fp32.json
+sed -i 's/segment:0/segment0/g; s/token:0/token0/g' model_zoo/roformer-tf-fp32.json
+# Run Acc scripts
+python3 core/perf_engine.py --hardware_type ILUVATAR --task roformer-tf-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | ACC |
+| -------- | --------- | --------- | ------- | ------- |
+| RoFormer | 2 | FP16 | 195.186 | 0.33789 |
diff --git a/models/nlp/language_model/roformer/ixrt/deploy.py b/models/nlp/language_model/roformer/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..073fb7333577624be7c304eaeb1916d272cb4dcc
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/deploy.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import argparse
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ model = onnx.load(args.model_path)
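+    # Strip ':' from graph input names (TensorFlow tensor names such as
+    # "input_segment:0" become "input_segment0") and update every node that
+    # references them, matching the shapes used in the inference scripts.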
+ for input in model.graph.input:
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] =name.replace(':',"")
+        input.name = input.name.replace(':', "")
+    # Save the modified model
+    onnx.save(model, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/export_onnx.py b/models/nlp/language_model/roformer/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..475dddd7c2ab27b6ca342be98ea92d2c791ff60b
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/export_onnx.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tf2onnx
+from tf2onnx import tf_loader
+import argparse
+ONNX_OPSET = 11
+
+def _convert_graphdef_to_onnx(graph_def,
+ inputs=None,
+ outputs=None,
+ output_path='',
+ **kwargs):
+
+ inputs_as_nchw = kwargs.get('inputs_as_nchw', None)
+ custom_ops = kwargs.get('custom_ops', None)
+ custom_op_handlers = kwargs.get('custom_op_handlers', None)
+ custom_rewriter = kwargs.get('custom_rewriter', None)
+ extra_opset = kwargs.get('extra_opset', None)
+ large_model = kwargs.get('large_model', False)
+ name = kwargs.get('name', 'habana_convert')
+ target = kwargs.get('target', None)
+ shape_override = kwargs.get('shape_override', {})
+
+ tf2onnx.convert.from_graph_def(graph_def,
+ name=name,
+ input_names=inputs,
+ output_names=outputs,
+ opset=ONNX_OPSET,
+ custom_ops=custom_ops,
+ custom_op_handlers=custom_op_handlers,
+ custom_rewriter=custom_rewriter,
+ inputs_as_nchw=inputs_as_nchw,
+ extra_opset=extra_opset,
+ shape_override=shape_override,
+ target=target,
+ large_model=large_model,
+ output_path=output_path)
+ return output_path
+
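+# Load a TensorFlow SavedModel, resolve its input/output tensor names, and
+# convert the frozen graph_def to ONNX (opset 11) via tf2onnx.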
+def savedmodel_to_onnx(model_path, output_path='', **kwargs):
+ inputs = kwargs.get('inputs', None)
+ outputs = kwargs.get('outputs', None)
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ model_path, inputs, outputs)
+ return _convert_graphdef_to_onnx(graph_def, inputs, outputs, output_path, **kwargs)
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ savedmodel_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/perf_engine.py b/models/nlp/language_model/roformer/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3f108474b281bfce71ccaf73d60ba3119cf97c1
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        Byte MLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+ Compile Backend could do some optimization like convert model format here
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+ sys.path[:
+ 0] = self.prev_sys_path #will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
\ No newline at end of file
diff --git a/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh b/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0510e32d98c50d995b584fac3241b804eca512c6
--- /dev/null
+++ b/models/nlp/language_model/roformer/ixrt/scripts/infer_roformer_fp16_performance.sh
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ SHAPE="input_segment0:${BS}x1024,input_token0:${BS}x1024"
+ MAX_SHAPE="input_segment0:64x1024,input_token0:64x1024"
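+  # The min/opt profiles follow the requested batch size; the max profile allows batches up to 64 at sequence length 1024.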
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --model_type roformer
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin \
+ --min_shape $SHAPE --opt_shape $SHAPE --max_shape $MAX_SHAPE --shapes $SHAPE
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes ${SHAPE}
+
+}
+run 2
\ No newline at end of file
diff --git a/models/nlp/language_model/videobert/ixrt/README.md b/models/nlp/language_model/videobert/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d485fbe39f024ea7036fa987876307bfff02b2f5
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/README.md
@@ -0,0 +1,84 @@
+# VideoBERT
+
+## Description
+
+VideoBERT is a model designed for video understanding tasks, extending the capabilities of BERT (Bidirectional Encoder Representations from Transformers) to video data. It enhances video representation learning by integrating both visual and textual information into a unified framework.
+
+## Setup
+
+### Install
+
+```bash
+apt install -y libnuma-dev
+
+pip3 install onnxsim
+pip3 install onnx_graphsurgeon
+pip3 install scikit-learn
+pip3 install tqdm
+pip3 install pycuda
+pip3 install onnx
+pip3 install tabulate
+pip3 install cv2
+pip3 install pycocotools
+pip3 install opencv-python==4.6.0.66
+pip3 install transformers==4.33.3
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the cifar-100-python dataset.
+
+Or you can run:
+
+```bash
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/nlp/language_model/videobert/ixrt
+cd ${MODEL_PATH}
+bash ./scripts/prepare_model_and_dataset.sh
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./general_perf/model_zoo/popular/open_videobert/video-bert
+export OPTIMIER_FILE=./ixrt-oss/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_videobert_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit here: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document:
+
+Note: You need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/backends/ILUVATAR/requirements.txt
+
+# copy data
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/
+cp -r ./datasets/open_cifar/cifar-100-python/ ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_cifar/
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_videobert/
+cp ./general_perf/model_zoo/popular/open_videobert/video-bert.onnx ByteMLPerf/byte_infer_perf/general_perf/model_zoo/popular/open_videobert/
+
+# run acc scripts
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task videobert-onnx-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | Top-1 ACC |
+| --------- | --------- | --------- | ----- | --------- |
+| VideoBERT | 4 | FP16 | 37.68 | 61.67 |
diff --git a/models/nlp/language_model/videobert/ixrt/perf_engine.py b/models/nlp/language_model/videobert/ixrt/perf_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..089d9860f573bba7e19f84aa20fb830a8fcc22d8
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/perf_engine.py
@@ -0,0 +1,349 @@
+# Copyright 2023 ByteDance and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import os
+import logging
+import importlib
+import json
+import subprocess
+import time
+
+from typing import Any, Dict, Tuple
+from prompt_toolkit.shortcuts import radiolist_dialog, input_dialog, yes_no_dialog
+from prompt_toolkit.styles import Style
+
+BYTE_MLPERF_ROOT = os.path.dirname(
+ os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+os.chdir(BYTE_MLPERF_ROOT)
+sys.path.insert(0, BYTE_MLPERF_ROOT)
+
+import argparse
+from general_perf.core.configs.workload_store import load_workload
+from general_perf.core.configs.dataset_store import load_dataset
+from general_perf.core.configs.backend_store import init_compile_backend, init_runtime_backend
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("PerfEngine")
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--task",
+ default="resnet50-tf-fp32",
+ help="The task going to be evaluted, refs to workloads/")
+ parser.add_argument(
+ "--hardware_type",
+ default="GPU",
+ help="The backend going to be evaluted, refs to backends/")
+ parser.add_argument("--compile_only",
+ action='store_true',
+ help="Run compilation only")
+
+ args = parser.parse_args()
+ return args
+
+
+class PerfEngine:
+ def __init__(self) -> None:
+ super().__init__()
+ self.args = get_args()
+ self.workload = load_workload(self.args.task)
+ self.backend_type = self.args.hardware_type
+ self.compile_backend = None
+ self.old_os_path = os.environ['PATH']
+ self.prev_sys_path = list(sys.path)
+ self.real_prefix = sys.prefix
+ self.compile_only_mode = False
+
+ def start_engine(self) -> None:
+ '''
+        ByteMLPerf will create a virtual env for each backend to avoid dependency conflicts
+ '''
+ success, total = 0, len(self.workload)
+ if total == 0:
+ return
+ log.info("******************* Backend Env Initization *******************")
+ status = self.activate_venv(self.backend_type)
+ if not status:
+ log.warning("Activate virtualenv Failed, Please Check...")
+
+ self.compile_backend = init_compile_backend(self.backend_type)
+ self.runtime_backend = init_runtime_backend(self.backend_type)
+
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type)
+ os.makedirs(output_dir, exist_ok=True)
+
+ status = self.single_workload_perf(self.workload)
+
+ def single_workload_perf(
+ self, workload: Dict[str, Any]) -> bool:
+ log.info("******************************************* Start to test model: {}. *******************************************".format(workload['model']))
+
+ # Check Compile Only Mode
+ self.compile_only_mode = False
+ if self.args.compile_only or workload['compile_only']:
+ self.compile_only_mode = True
+
+ base_report = {
+ "Model": workload['model'].upper(),
+ "Backend": self.backend_type,
+ "Host Info": self.get_cpu_name()
+ }
+
+        # Initialize Model Config Info
+ model_info = self.get_model_info(workload['model'])
+ pre_compile_config = {"workload": workload, 'model_info': model_info}
+ interact_info = self.check_interact_info(pre_compile_config)
+ pre_compile_config['interact_info'] = interact_info
+ if not model_info['dataset_name']:
+ model_info['dataset_name'] = 'fake_dataset'
+
+
+ '''
+        The compile backend can apply optimizations here, such as converting the model format
+ '''
+ log.info("******************************************* Running Backend Compilation... *******************************************")
+ log.info("Running Backend Preoptimization...")
+ pre_compile_config = self.compile_backend.pre_optimize(pre_compile_config)
+
+
+        # Initialize dataset
+ dataset = load_dataset(model_info)
+ dataset.preprocess()
+ base_report['Dataset'] = model_info['dataset_name'].upper(
+ ) if model_info['dataset_name'] else None
+
+ #Placeholder Only
+ segment_info = self.compile_backend.segment(pre_compile_config)
+
+ best_batch_sizes = self.compile_backend.get_best_batch_size()
+ if isinstance(best_batch_sizes, list):
+ pre_compile_config['workload'][
+ 'batch_sizes'] = best_batch_sizes
+
+ log.info("Start to compile the model...")
+ start = time.time()
+ compile_info = self.compile_backend.compile(pre_compile_config,
+ dataset)
+ end = time.time()
+
+ graph_compile_report = {}
+ graph_compile_report["Compile Duration"] = round(end - start, 5)
+ graph_compile_report["Compile Precision"] = compile_info[
+ 'compile_precision']
+ graph_compile_report["Subgraph Coverage"] = compile_info['sg_percent']
+ if 'optimizations' in compile_info:
+ graph_compile_report['Optimizations'] = compile_info['optimizations']
+ if 'instance_count' in compile_info:
+ base_report['Instance Count'] = compile_info['instance_count']
+ if 'device_count' in compile_info:
+ base_report['Device Count'] = compile_info['device_count']
+ base_report['Graph Compile'] = graph_compile_report
+
+        # Initialize Output Dir and Reports
+ output_dir = os.path.abspath('general_perf/reports/' +
+ self.backend_type + '/' +
+ workload['model'])
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Compile only mode will stop here
+ if self.compile_only_mode:
+ base_report.pop("Backend")
+ return compile_info["compile_status"], base_report
+
+ # load runtime backend
+ """
+ Start Here
+ """
+ batch_sizes = pre_compile_config['workload']['batch_sizes']
+ self.runtime_backend.configs = compile_info
+ self.runtime_backend.workload = workload
+ self.runtime_backend.model_info = model_info
+
+ self.runtime_backend.load(workload['batch_sizes'][0])
+ # test accuracy
+ accuracy_report = {}
+ AccuracyChecker = self.get_accuracy_checker(
+ model_info['dataset_name']
+ if model_info['dataset_name'] else 'fake_dataset')
+ AccuracyChecker.runtime_backend = self.runtime_backend
+ AccuracyChecker.dataloader = dataset
+ AccuracyChecker.output_dir = output_dir
+ AccuracyChecker.configs = compile_info
+
+ if workload['test_accuracy']:
+ log.info("******************************************* Running Accuracy Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+
+ accuracy_report['Data Percent'] = workload['data_percent']
+ accuracy_report.update(accuracy_results)
+
+ # test numeric
+ if workload['test_numeric']:
+ log.info("******************************************* Running Numeric Checker... *******************************************")
+
+ dataset.rebatch(self.runtime_backend.get_loaded_batch_size())
+ if not workload['test_accuracy']:
+ accuracy_results = AccuracyChecker.calculate_acc(
+ workload['data_percent'])
+ diff_results = AccuracyChecker.calculate_diff()
+ accuracy_report.update(diff_results)
+ # accuracy_report['Diff Dist'] = compile_info['model'] + '-to-' + compile_info['compile_precision'].lower() + ".png"
+
+ if accuracy_report:
+ base_report['Accuracy'] = accuracy_report
+
+ # function to test qps and latency
+ if workload['test_perf']:
+ log.info("******************************************* Runing QPS Checker... *******************************************")
+ performance_reports = []
+ qs_status = self.runtime_backend.is_qs_mode_supported()
+ if qs_status:
+ qs_config = self.runtime_backend.generate_qs_config()
+ performance_reports = self.qs_benchmark(qs_config)
+ else:
+ for bs in batch_sizes:
+ self.runtime_backend.load(bs)
+ batch_reports = self.runtime_backend.benchmark(dataset)
+ performance_reports.append(batch_reports)
+ base_report['Performance'] = performance_reports
+
+ if "Instance Count" not in base_report:
+ log.warning("Vendors need to Add # of instances")
+ if "Device Count" not in base_report:
+ log.warning("Vendors need to Add # of devices")
+
+ # write output to json file
+ output_report_path = output_dir + "/result-" + compile_info['compile_precision'].lower() + ".json"
+ with open(output_report_path, 'w') as file:
+ json.dump(base_report, file, indent=4)
+
+ base_report.pop("Backend")
+ log.info("Testing Finish. Report is saved in path: [ {}/{} ]".
+ format(output_dir[output_dir.rfind('general_perf'):],
+ os.path.basename(output_report_path)))
+
+ return compile_info["compile_status"]
+
+ #WIP
+ def qs_benchmark(self, qs_config: Dict[str, Any]) -> list:
+ return []
+
+ def get_accuracy_checker(self, dataset_name: str):
+ AccuracyChecker = importlib.import_module('general_perf.datasets.' +
+ dataset_name +
+ ".test_accuracy")
+ AccuracyChecker = getattr(AccuracyChecker, 'AccuracyChecker')
+ return AccuracyChecker()
+
+ def get_model_info(self, model_name: str) -> Dict[str, Any]:
+ with open("general_perf/model_zoo/" + model_name + '.json',
+ 'r') as file:
+ model_info = json.load(file)
+ return model_info
+
+ def get_cpu_name(self):
+ command = "lscpu | grep 'Model name' | awk -F: '{print $2}'"
+ cpu_name = subprocess.check_output(command, shell=True)
+ return cpu_name.decode().strip()
+
+ def check_interact_info(
+ self, pre_compile_config: Dict[str, Dict]) -> Dict[str, Any]:
+ interact_info = self.compile_backend.get_interact_profile(
+ pre_compile_config)
+
+ answer = {}
+ if len(interact_info) == 0:
+ return answer
+
+ dialog_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ })
+
+ input_style = Style.from_dict({
+ 'dialog': 'bg:#88b8ff',
+ 'dialog frame.label': 'bg:#ffffff #000000',
+ 'dialog.body': 'bg:#000000 #a0acde',
+ 'dialog shadow': 'bg:#004aaa',
+ 'text-area.prompt': 'bg:#ffffff',
+ 'text-area': '#000000',
+ })
+
+ option = yes_no_dialog(title=self.backend_type + '编译配置',
+ text='[请选择]:是否进行编译后端配置:',
+ style=dialog_style).run()
+ if option:
+ sum_question = len(interact_info)
+ for i, question in enumerate(interact_info):
+ if question['depends']:
+ state = 0
+ for title in question['depends'].split(','):
+ if not answer[title]:
+ state = 1
+ if state:
+ continue
+ if question['dialog_type'] == 'Yes/No Dialog':
+ option = yes_no_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=dialog_style).run()
+ elif question['dialog_type'] == "Input Dialog":
+ option = input_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ style=input_style).run()
+ elif question['dialog_type'] == "Radiolist Dialog":
+ choice = [(i, text)
+ for i, text in enumerate(question['options'])]
+ num = radiolist_dialog(
+ title=self.backend_type + '编译配置进度(' + str(i + 1) +
+ '/' + str(sum_question) + ')',
+ text="[Backend " + self.backend_type + "]: " +
+ question['note'],
+ values=choice,
+ style=dialog_style).run()
+ option = question['options'][num] if num is not None else question[
+ 'default']
+ answer[question['name']] = option
+
+ return answer
+
+ def activate_venv(self, hardware_type: str) -> bool:
+
+ return True
+
+ def deactivate_venv(self):
+        sys.path[:0] = self.prev_sys_path  # will also revert the added site-packages
+ sys.prefix = self.real_prefix
+ os.environ['PATH'] = self.old_os_path
+
+
+if __name__ == "__main__":
+ engine = PerfEngine()
+ engine.start_engine()
diff --git a/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7911aecdb775bcec206c398b81eff18e083597f0
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/scripts/infer_videobert_fp16_performance.sh
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+# Start to test
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+  BS=${1:-16}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --dump_onnx
+
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --min_shape image:${BS}x3x224x224,text:100x77 \
+ --opt_shape image:${BS}x3x224x224,text:100x77 \
+ --max_shape image:${BS}x3x224x224,text:100x77 \
+ --save_engine ${TARGET_ENGINE} --log_level error --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE} --plugins ixrt_plugin --shapes image:${BS}x3x224x224,text:100x77
+
+}
+run 16
\ No newline at end of file
diff --git a/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c57f758d35547a14106d1acbedb2510fba335c44
--- /dev/null
+++ b/models/nlp/language_model/videobert/ixrt/scripts/prepare_model_and_dataset.sh
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# #!/bin/bash
+echo "******************* Downloading Model.... *******************"
+
+mkdir -p general_perf/model_zoo/regular
+mkdir -p general_perf/model_zoo/popular
+mkdir -p general_perf/model_zoo/sota
+mkdir -p general_perf/download
+mkdir -p datasets/open_cifar/
+
+wget -O general_perf/download/open_videobert.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/open_videobert.tar
+tar xf general_perf/download/open_videobert.tar -C general_perf/model_zoo/popular/
+
+
+# # Download Datasets
+wget -O general_perf/download/cifar-100-python.tar https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/cifar-100-python.tar
+tar xf general_perf/download/cifar-100-python.tar -C datasets/open_cifar
+
+
+echo "Extract Done."
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/README.md b/models/recommendation/widedeep/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..350a6da35f97da01cb2c82932c93dac64360f3cc
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/README.md
@@ -0,0 +1,84 @@
+# Wide&Deep
+
+## Description
+
+Generalized linear models with nonlinear feature transformations are widely used for large-scale regression and classification problems with sparse inputs. Memorization of feature interactions through a wide set of cross-product feature transformations are effective and interpretable, while generalization requires more feature engineering effort. With less feature engineering, deep neural networks can generalize better to unseen feature combinations through low-dimensional dense embeddings learned for the sparse features. However, deep neural networks with embeddings can over-generalize and recommend less relevant items when the user-item interactions are sparse and high-rank. In this paper, we present Wide & Deep learning---jointly trained wide linear models and deep neural networks---to combine the benefits of memorization and generalization for recommender systems. We productionized and evaluated the system on Google Play, a commercial mobile app store with over one billion active users and over one million apps. Online experiment results show that Wide & Deep significantly increased app acquisitions compared with wide-only and deep-only models. We have also open-sourced our implementation in TensorFlow.
+
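+The wide and deep branches can be pictured with a small sketch. This is an illustrative, hypothetical example only: it assumes PyTorch is available, the class and layer sizes below are made up, and the actual artifact in this directory is a TensorFlow SavedModel exported to ONNX. The 26 categorical and 13 numeric inputs mirror the Criteo features used later in this README.
+
+```python
+import torch
+import torch.nn as nn
+
+class WideAndDeepSketch(nn.Module):
+    """Toy wide & deep model: a linear 'wide' path for memorization plus an
+    embedding-based 'deep' MLP for generalization, summed before the sigmoid."""
+    def __init__(self, vocab_size=1000, n_sparse=26, n_dense=13, embed_dim=16):
+        super().__init__()
+        self.wide = nn.Linear(vocab_size, 1)                   # wide: sparse / cross-product features
+        self.embeddings = nn.Embedding(vocab_size, embed_dim)  # shared vocabulary for simplicity
+        self.deep = nn.Sequential(                              # deep: embeddings + numeric features
+            nn.Linear(n_sparse * embed_dim + n_dense, 256), nn.ReLU(),
+            nn.Linear(256, 128), nn.ReLU(),
+            nn.Linear(128, 1),
+        )
+
+    def forward(self, wide_x, sparse_ids, dense_x):
+        deep_x = torch.cat([self.embeddings(sparse_ids).flatten(1), dense_x], dim=1)
+        return torch.sigmoid(self.wide(wide_x) + self.deep(deep_x))
+
+model = WideAndDeepSketch()
+ctr = model(torch.rand(4, 1000), torch.randint(0, 1000, (4, 26)), torch.rand(4, 13))
+print(ctr.shape)  # torch.Size([4, 1])
+```
+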
+## Setup
+
+### Install
+
+```bash
+pip3 install tf2onnx
+pip3 install pycuda
+pip3 install onnxsim
+pip3 install py-libnuma==1.2
+```
+
+### Download
+
+Pretrained model:
+
+Dataset:
+
+```bash
+# Go to path of this model
+export PROJ_ROOT=/PATH/TO/DEEPSPARKINFERENCE
+export MODEL_PATH=${PROJ_ROOT}/models/recommendation/widedeep/ixrt
+cd ${MODEL_PATH}
+
+# export onnx
+python3 export_onnx.py --model_path open_wide_deep_saved_model --output_path open_wide_deep_saved_model/widedeep.onnx
+
+# Simplify onnx model
+onnxsim open_wide_deep_saved_model/widedeep.onnx open_wide_deep_saved_model/widedeep_sim.onnx
+python3 deploy.py --model_path open_wide_deep_saved_model/widedeep_sim.onnx --output_path open_wide_deep_saved_model/widedeep_sim.onnx
+python3 change2dynamic.py --model_path open_wide_deep_saved_model/widedeep_sim.onnx --output_path open_wide_deep_saved_model/widedeep_sim.onnx
+```
+
+## Inference
+
+```bash
+export ORIGIN_ONNX_NAME=./open_wide_deep_saved_model/widedeep_sim
+export OPTIMIER_FILE=${IXRT_OSS_ROOT}/tools/optimizer/optimizer.py
+export PROJ_PATH=./
+```
+
+### Performance
+
+```bash
+bash scripts/infer_widedeep_fp16_performance.sh
+```
+
+### Accuracy
+
+If you want to evaluate the accuracy of this model, please visit the website: , which integrates inference and training of many models under this framework and supports the ILUVATAR backend.
+
+For detailed steps regarding this model, please refer to this document:
+
+Note: You need to modify the relevant paths in the code to your own correct paths.
+
+```bash
+# link and install ByteMLPerf requirements
+ln -s ${PROJ_ROOT}/toolbox/ByteMLPerf ./
+pip3 install -r ./ByteMLPerf/byte_infer_perf/general_perf/requirements.txt
+mv perf_engine.py ./ByteMLPerf/byte_infer_perf/general_perf/core/perf_engine.py
+
+# Get eval.csv and onnx
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model
+mkdir -p ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/
+
+wget https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/eval.csv
+mv eval.csv ./ByteMLPerf/byte_infer_perf/general_perf/datasets/open_criteo_kaggle/
+
+wget http://files.deepspark.org.cn:880/deepspark/widedeep_dynamicshape_new.onnx
+mv widedeep_dynamicshape_new.onnx ./ByteMLPerf/byte_infer_perf/general_perf/model_zoo/regular/open_wide_deep_saved_model/
+
+# Run Acc scripts
+cd ./ByteMLPerf/byte_infer_perf/general_perf
+python3 core/perf_engine.py --hardware_type ILUVATAR --task widedeep-tf-fp32
+```
+
+## Results
+
+| Model | BatchSize | Precision | FPS | ACC |
+| --------- | --------- | --------- | -------- | ------- |
+| Wide&Deep | 1024 | FP16 | 77073.93 | 0.74597 |
diff --git a/models/recommendation/widedeep/ixrt/change2dynamic.py b/models/recommendation/widedeep/ixrt/change2dynamic.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9bcf6f156bcd1bfb6e9a7e150c0eb4461e70f98
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/change2dynamic.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import onnx
+
+def change_input_output_dim(model):
+ # Use some symbolic name not used for any other dimension
+ sym_batch_dim = "batch"
+ # sym_batch_dim = -1
+
+ # The following code changes the first dimension of every input to be batch-dim
+ # Modify as appropriate ... note that this requires all inputs to
+ # have the same batch_dim
+ inputs = model.graph.input
+ for input in inputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with first dim.
+ # Add checks as needed.
+ dim1 = input.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ dim1.dim_param = sym_batch_dim
+
+ if input.name == "new_categorical_placeholder:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(2)
+ elif input.name == "new_numeric_placeholder:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(13)
+ elif input.name == "import/head/predictions/zeros_like:0":
+ input.type.tensor_type.shape.dim[1].dim_value = int(1)
+
+ # or update it to be an actual value:
+ # dim1.dim_value = actual_batch_dim
+
+ outputs = model.graph.output
+
+ for output in outputs:
+        # Checks omitted. This assumes that all inputs are tensors and have a shape with first dim.
+ # Add checks as needed.
+ dim1 = output.type.tensor_type.shape.dim[0]
+ # update dim to be a symbolic value
+ dim1.dim_param = sym_batch_dim
+
+def change_input_node_name(model, input_names):
+ for i,input in enumerate(model.graph.input):
+ input_name = input_names[i]
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] = input_name
+ input.name = input_name
+
+
+def change_output_node_name(model, output_names):
+ for i,output in enumerate(model.graph.output):
+ output_name = output_names[i]
+ for node in model.graph.node:
+ for i, name in enumerate(node.output):
+ if name == output.name:
+ node.output[i] = output_name
+ output.name = output_name
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+
+
+if __name__ == "__main__":
+ args = get_args()
+ model = onnx.load(args.model_path)
+ change_input_output_dim(model)
+ model = onnx.load(args.model_path)
+ for input in model.graph.input:
+ for node in model.graph.node:
+ for i, name in enumerate(node.input):
+ if name == input.name:
+ node.input[i] =name.replace(':',"")
+ input.name=input.name.replace(':',"")# 保存修改后的模型
+ onnx.save(model, args.output_path)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/deploy.py b/models/recommendation/widedeep/ixrt/deploy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e1ac694f9a096b4aa6cb0b2acbbc689e5d901db
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/deploy.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import onnx
+import argparse
+import copy
+
+from typing import Union, Callable, List
+
+from tensorrt.deploy.api import *
+from tensorrt.deploy.backend.onnx.converter import default_converter
+from tensorrt.deploy.backend.torch.executor.operators._operators import to_py_type
+from tensorrt.deploy.ir.operator_attr import BaseOperatorAttr, EmptyAttr
+from tensorrt.deploy.ir.operator_type import OperatorType as OP
+from tensorrt.deploy.ir import operator_attr as attr, Operator, generate_operator_name
+from tensorrt.deploy.fusion import BasePass, PatternGraph, build_sequence_graph, GraphMatcher, PassSequence
+from tensorrt.deploy.ir import Graph
+from tensorrt.deploy.quantizer.quant_operator.base import quant_single_input_operator
+from tensorrt.deploy.backend.onnx.converter import convert_onnx_operator
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+
+class FuseGemmPass(BasePass):
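+    """Fuse a MatMul followed by an Add of a bias into a single Gemm node; the bias
+    becomes the Gemm's third input. Only applied when the MatMul weight is a constant
+    2-D initializer."""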
+ def process(self, graph: Graph) -> Graph:
+ self.transform = GraphTransform(graph)
+
+ self.transform.find_sequence_subgraph(
+ pattern=[OP.MATMUL, OP.ADD], callback=self.fuse_gemm, strict=True
+ )
+ return graph
+
+ def fuse_gemm(self, graph, pattern: PatternGraph):
+ matmul = pattern.nodes[0]
+ add = pattern.nodes[1]
+
+ if len(add.operator.inputs) != 2:
+ return
+
+ b_var = graph.get_variable(matmul.operator.inputs[1])
+ if not graph.is_leaf_variable(b_var) or b_var.value is None:
+ return
+
+ if b_var.value.ndim != 2:
+ return
+
+ bias_var = None
+ for input in add.operator.inputs:
+ if input not in matmul.operator.outputs:
+ bias_var = input
+
+ matmul.operator.inputs.append(bias_var)
+ self.transform.delete_operator_and_link(
+ add.operator, link_input=matmul.operator.outputs[0]
+ )
+
+ matmul.operator.op_type = OP.GEMM
+ matmul.operator.attributes = attr.GemmAttr(transB=1)
+
+def replace_input(graph):
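+    """Delete the operators between Shape__8 and the zeros_like node and expose
+    "import/head/predictions/zeros_like:0" as a graph input instead."""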
+ transformer = GraphTransform(graph)
+ from_op = graph.get_operator("Shape__8")
+ to_op = graph.get_operator('import/head/predictions/zeros_like')
+ var = graph.get_variable("import/head/predictions/zeros_like:0")
+ transformer.delete_operators_between_op_op(from_op=from_op, to_op=to_op)
+ transformer.add_input("import/head/predictions/zeros_like:0")
+ return graph
+
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ graph = create_source(args.model_path)()
+ graph = FuseGemmPass().process(graph)
+ graph = replace_input(graph)
+ create_target(saved_path=args.output_path).export(graph)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/export_onnx.py b/models/recommendation/widedeep/ixrt/export_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..475dddd7c2ab27b6ca342be98ea92d2c791ff60b
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/export_onnx.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import tf2onnx
+from tf2onnx import tf_loader
+import argparse
+ONNX_OPSET = 11
+
+def _convert_graphdef_to_onnx(graph_def,
+ inputs=None,
+ outputs=None,
+ output_path='',
+ **kwargs):
+
+ inputs_as_nchw = kwargs.get('inputs_as_nchw', None)
+ custom_ops = kwargs.get('custom_ops', None)
+ custom_op_handlers = kwargs.get('custom_op_handlers', None)
+ custom_rewriter = kwargs.get('custom_rewriter', None)
+ extra_opset = kwargs.get('extra_opset', None)
+ large_model = kwargs.get('large_model', False)
+ name = kwargs.get('name', 'habana_convert')
+ target = kwargs.get('target', None)
+ shape_override = kwargs.get('shape_override', {})
+
+ tf2onnx.convert.from_graph_def(graph_def,
+ name=name,
+ input_names=inputs,
+ output_names=outputs,
+ opset=ONNX_OPSET,
+ custom_ops=custom_ops,
+ custom_op_handlers=custom_op_handlers,
+ custom_rewriter=custom_rewriter,
+ inputs_as_nchw=inputs_as_nchw,
+ extra_opset=extra_opset,
+ shape_override=shape_override,
+ target=target,
+ large_model=large_model,
+ output_path=output_path)
+ return output_path
+
+def savedmodel_to_onnx(model_path, output_path='', **kwargs):
+ inputs = kwargs.get('inputs', None)
+ outputs = kwargs.get('outputs', None)
+ graph_def, inputs, outputs = tf_loader.from_saved_model(
+ model_path, inputs, outputs)
+ return _convert_graphdef_to_onnx(graph_def, inputs, outputs, output_path, **kwargs)
+
+def get_args():
+ """Parse commandline."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model_path", default="")
+ parser.add_argument("--output_path", default="")
+ args = parser.parse_args()
+ return args
+
+if __name__ == "__main__":
+ args = get_args()
+ savedmodel_to_onnx(args.model_path, args.output_path)
\ No newline at end of file
diff --git a/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh b/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..866adb44937ac5c616b856e13122073ea5cb4233
--- /dev/null
+++ b/models/recommendation/widedeep/ixrt/scripts/infer_widedeep_fp16_performance.sh
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -x
+ORIGIN_ONNX=${ORIGIN_ONNX_NAME}.onnx
+cd ${PROJ_PATH}
+
+run(){
+ BS=${1:-1}
+ TARGET_ONNX=${ORIGIN_ONNX_NAME}_end.onnx
+ TARGET_ENGINE=${ORIGIN_ONNX_NAME}_bs_${BS}_end.engine
+ if [[ ! -f "${ORIGIN_ONNX}" ]];then
+ echo "${ORIGIN_ONNX} not exists!"
+ exit 1
+ fi
+
+ # Graph optimize
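+  # The Criteo inputs pack 26 categorical features per sample into the leading dim (hence 26*BS) and 13 numeric features as BSx13.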
+ python3 ${OPTIMIER_FILE} --onnx ${ORIGIN_ONNX} --input_shapes "new_categorical_placeholder0:$((26 * ${BS}))x2,new_numeric_placeholder0:${BS}x13,import/head/predictions/zeros_like0:${BS}x1"
+ # Build Engine
+ ixrtexec --onnx ${TARGET_ONNX} --save_engine ${TARGET_ENGINE} --log_level error
+
+ # Test Performance
+ ixrtexec --load_engine ${TARGET_ENGINE}
+
+}
+run 1
\ No newline at end of file
diff --git a/models/speech/speech_recognition/conformer/ixrt/README.md b/models/speech/speech_recognition/conformer/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2ad0e26a13c8c6b2400e78726f8b0bab713cea45
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/README.md
@@ -0,0 +1,61 @@
+# Conformer
+
+## Description
+
+Conformer is a speech recognition model proposed by Google in 2020. It combines the advantages of CNN and Transformer. CNN efficiently extracts local features, while Transformer is more effective in capturing long sequence dependencies. Conformer applies convolution to the Encoder layer of Transformer, enhancing the performance of Transformer in the ASR (Automatic Speech Recognition) domain.
+
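+For orientation only, the block structure described above can be sketched in PyTorch as below. This is a simplified, hypothetical example: the deployed engine is built from fused IxRT plugins by convert2onnx.py in this directory, and the real checkpoint additionally uses relative positional attention and sequence masks, which are omitted here.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ConformerBlockSketch(nn.Module):
+    """Macaron FFN -> self-attention -> convolution module -> FFN -> LayerNorm."""
+    def __init__(self, d_model=256, n_head=4, ff_hidden=2048, conv_kernel=15):
+        super().__init__()
+        def ffn():
+            return nn.Sequential(nn.LayerNorm(d_model),
+                                 nn.Linear(d_model, ff_hidden), nn.SiLU(),
+                                 nn.Linear(ff_hidden, d_model))
+        self.ff1, self.ff2 = ffn(), ffn()
+        self.norm_mha = nn.LayerNorm(d_model)
+        self.mha = nn.MultiheadAttention(d_model, n_head, batch_first=True)
+        self.norm_conv = nn.LayerNorm(d_model)
+        self.pointwise1 = nn.Conv1d(d_model, 2 * d_model, kernel_size=1)
+        self.depthwise = nn.Conv1d(d_model, d_model, conv_kernel,
+                                   padding=conv_kernel // 2, groups=d_model)
+        self.bn = nn.BatchNorm1d(d_model)
+        self.pointwise2 = nn.Conv1d(d_model, d_model, kernel_size=1)
+        self.norm_out = nn.LayerNorm(d_model)
+
+    def forward(self, x):                          # x: (batch, time, d_model)
+        x = x + 0.5 * self.ff1(x)                  # macaron feed-forward, half-step residual
+        h = self.norm_mha(x)
+        x = x + self.mha(h, h, h, need_weights=False)[0]
+        c = self.norm_conv(x).transpose(1, 2)      # (batch, d_model, time) for Conv1d
+        c = F.glu(self.pointwise1(c), dim=1)       # gated pointwise expansion
+        c = self.pointwise2(F.silu(self.bn(self.depthwise(c))))
+        x = x + c.transpose(1, 2)
+        x = x + 0.5 * self.ff2(x)
+        return self.norm_out(x)
+
+print(ConformerBlockSketch()(torch.randn(2, 100, 256)).shape)  # torch.Size([2, 100, 256])
+```
+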
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+pip3 install tqdm
+pip3 install onnx
+pip3 install typeguard==2.13.3
+pip3 install onnxsim
+pip3 install pycuda
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the Aishell dataset.
+
+Download the model and put it in the conformer_checkpoints directory.
+
+```bash
+ln -s /home/deepspark/datasets/INFER/conformer/20210601_u2++_conformer_exp_aishell ./conformer_checkpoints
+```
+
+### Prepare Data
+
+```bash
+# Accuracy
+DATA_DIR=/PATH/to/data_aishell
+TOOL_DIR="$(pwd)/tools"
+bash scripts/aishell_data_prepare.sh ${DATA_DIR} ${TOOL_DIR}
+```
+
+## Model Conversion And Inference
+
+### FP16
+
+```bash
+# Accuracy
+bash scripts/infer_conformer_fp16_accuracy_ixrt.sh
+# Performance
+bash scripts/infer_conformer_fp16_performance_ixrt.sh
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | CER |
+| --------- | --------- | --------- | ------- | ------ |
+| Conformer | 24 | FP16 | 387.821 | 0.0517 |
diff --git a/models/speech/speech_recognition/conformer/ixrt/build_engine.py b/models/speech/speech_recognition/conformer/ixrt/build_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa20ee59f6ecd23d8a8cb9272ece0087ed65ab89
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/build_engine.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Build Engine From FusionPlugin Onnx.
+"""
+
+import os
+import ctypes
+import json
+import onnx
+import logging
+import argparse
+
+import tensorrt
+import tensorrt as trt
+from tensorrt import Dims
+
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so")
+ if not os.path.exists(dynamic_path):
+ raise FileNotFoundError(
+ f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!"
+ )
+ ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+ trt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
+
+load_ixrt_plugin()
+
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description="build tensorrt engine of conformer.", usage="")
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ required=True,
+ help="conformer",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ required=True,
+ help="onnx_path path to save",
+ )
+ parser.add_argument(
+ "--engine_path",
+ type=str,
+ required=True,
+ help="engine path to save",
+ )
+ parser.add_argument(
+ "--max_batch_size",
+ type=int,
+ required=True,
+ )
+ parser.add_argument(
+ "--max_seq_len",
+ type=int,
+ required=True,
+ )
+ args = parser.parse_args()
+ return args
+
+args = parse_args()
+MaxBSZ = args.max_batch_size
+MaxSeqLen = args.max_seq_len
+
+
+def build_engine_trtapi_dynamicshape(args):
+ onnx_model = args.onnx_path
+    assert os.path.isfile(onnx_model), f"The onnx model {onnx_model} must exist!"
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+
+ profile = builder.create_optimization_profile()
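+    # Register (min, opt, max) shapes for each dynamic input: the batch dim is fixed at MaxBSZ,
+    # while the time axis ranges from 100 to 1500 feature frames (25 to 374 after subsampling).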
+ profile.set_shape("input", Dims([MaxBSZ, 100, 80]), Dims([MaxBSZ, 1000, 80]), Dims([MaxBSZ, 1500, 80]))
+ profile.set_shape("mask", Dims([MaxBSZ, 1, 25]), Dims([MaxBSZ, 1, 250]), Dims([MaxBSZ, 1, 374]))
+ profile.set_shape("pos_emb", Dims([1, 25, 256]), Dims([1, 250, 256]), Dims([1, 374, 256]))
+ build_config.add_optimization_profile(profile)
+
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+ parser.parse_from_file(onnx_model)
+ build_config.set_flag(tensorrt.BuilderFlag.FP16)
+
+ # set dynamic
+ # input
+ input_tensor = network.get_input(0)
+ input_tensor.shape = Dims([MaxBSZ, -1, 80])
+ # mask
+ mask_tensor = network.get_input(1)
+ mask_tensor.shape = Dims([MaxBSZ, 1, -1])
+ # pos_emb
+ pos_emb_tensor = network.get_input(2)
+ pos_emb_tensor.shape = Dims([1, -1, 256])
+
+ plan = builder.build_serialized_network(network, build_config)
+ with open(args.engine_path, "wb") as f:
+ f.write(plan)
+
+ print("Build dynamic shape engine done!")
+
+
+def build_engine_trtapi_staticshape(args):
+ onnx_model = args.onnx_path
+    assert os.path.isfile(onnx_model), f"The onnx model {onnx_model} must exist!"
+ IXRT_LOGGER = tensorrt.Logger(tensorrt.Logger.WARNING)
+ builder = tensorrt.Builder(IXRT_LOGGER)
+ EXPLICIT_BATCH = 1 << (int)(tensorrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
+ network = builder.create_network(EXPLICIT_BATCH)
+ build_config = builder.create_builder_config()
+ parser = tensorrt.OnnxParser(network, IXRT_LOGGER)
+
+ parser.parse_from_file(onnx_model)
+ build_config.set_flag(tensorrt.BuilderFlag.FP16)
+
+ plan = builder.build_serialized_network(network, build_config)
+ with open(args.engine_path, "wb") as f:
+ f.write(plan)
+
+ print("Build static shape engine done!")
+
+
+if __name__ == "__main__":
+ build_engine_trtapi_dynamicshape(args)
+ # build_engine_trtapi_staticshape(args)
diff --git a/models/speech/speech_recognition/conformer/ixrt/common.py b/models/speech/speech_recognition/conformer/ixrt/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..89023300ddc7ca3e4f0f992f4b124d8a8c131ae5
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/common.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import ctypes
+import cv2
+import glob
+import torch
+import tensorrt
+import tensorrt as trt
+import numpy as np
+import pycuda.driver as cuda
+
+from tensorrt.hook.utils import copy_ixrt_io_tensors_as_np
+
+
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+ if not dynamic_path:
+ dynamic_path = os.path.join(os.path.dirname(trt.__file__), "lib", "libixrt_plugin.so")
+ if not os.path.exists(dynamic_path):
+ raise FileNotFoundError(
+ f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!"
+ )
+ ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+ trt.init_libnvinfer_plugins(logger, namespace)
+ print(f"Loaded plugin from {dynamic_path}")
+load_ixrt_plugin()
+
+
+def trtapi(engine_file):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    with open(engine_file, "rb") as f, tensorrt.Runtime(logger) as runtime:
+        assert runtime
+        engine = runtime.deserialize_cuda_engine(f.read())
+        assert engine
+        context = engine.create_execution_context()
+        assert context
+
+    return engine, context
+
+
+def create_engine_context(engine_path, logger):
+ with open(engine_path, "rb") as f:
+ runtime = tensorrt.Runtime(logger)
+ assert runtime
+ engine = runtime.deserialize_cuda_engine(f.read())
+ assert engine
+ context = engine.create_execution_context()
+ assert context
+
+ return engine, context
+
+
+def get_io_bindings(engine):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = engine.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ print(f"binding {i}, name : {name} dtype : {np.dtype(tensorrt.nptype(dtype))} shape : {list(shape)}")
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
+
+
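+# Same as get_io_bindings, but binding shapes are read from the execution context so that
+# resolved dynamic shapes are used when allocating device buffers.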
+def setup_io_bindings(engine, context):
+ # Setup I/O bindings
+ inputs = []
+ outputs = []
+ allocations = []
+
+ for i in range(engine.num_bindings):
+ is_input = False
+ if engine.binding_is_input(i):
+ is_input = True
+ name = engine.get_binding_name(i)
+ dtype = engine.get_binding_dtype(i)
+ shape = context.get_binding_shape(i)
+ if is_input:
+ batch_size = shape[0]
+ size = np.dtype(tensorrt.nptype(dtype)).itemsize
+ for s in shape:
+ size *= s
+ allocation = cuda.mem_alloc(size)
+ binding = {
+ "index": i,
+ "name": name,
+ "dtype": np.dtype(tensorrt.nptype(dtype)),
+ "shape": list(shape),
+ "allocation": allocation,
+ }
+ allocations.append(allocation)
+ if engine.binding_is_input(i):
+ inputs.append(binding)
+ else:
+ outputs.append(binding)
+ return inputs, outputs, allocations
diff --git a/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py b/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..823ae3215f58d18a636e868668199ed3f388ee20
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/convert2onnx.py
@@ -0,0 +1,529 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+Build Compute Graph(Fusion Plugin Onnx) From Checkpoints.
+"""
+
+import os
+import json
+import torch
+import argparse
+import numpy as np
+from collections import OrderedDict
+
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+from tensorrt.deploy.ir.data_type import DataType
+from tensorrt.deploy.ir.variable import Variable, VariableOptions
+from tensorrt.deploy.ir.graph import Graph
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Build Compute Graph From Checkpoints.", usage=""
+ )
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ required=True,
+ help="conformer",
+ )
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ required=True,
+ help="checkpont of conformer",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ required=True,
+ help="raw onnx path to save",
+ )
+ parser.add_argument(
+ "--batch_size",
+ type=int,
+ required=True,
+ help="the batch size for test.",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def add_global_cmvn_op(graph, state_dict, args):
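+    """Global CMVN: normalize features as (x - mean) * istd, then unsqueeze a channel
+    dimension for the following Conv2d subsampling stack."""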
+ t = graph
+
+ sub_inputs = [t.make_variable("input", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ key = "encoder.global_cmvn.mean"
+ sub_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ sub_outputs = [t.make_variable("Sub_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ t.make_operator(
+ "Sub",
+ inputs=sub_inputs,
+ outputs=sub_outputs,
+ )
+
+ mul_inputs = sub_outputs
+ key = "encoder.global_cmvn.istd"
+ mul_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ mul_outputs = [t.make_variable("Mul_output_0", dtype=DataType.FLOAT, shape=(128, 1500, 80))]
+ t.make_operator(
+ "Mul",
+ inputs=mul_inputs,
+ outputs=mul_outputs,
+ )
+
+ unsqueeze_inputs = mul_outputs
+ unsqueeze_inputs.append(t.make_variable("axes", value=np.array([1], dtype=np.int64)))
+ unsqueeze_outputs = [t.make_variable("Unsqueeze_output_0", dtype=DataType.FLOAT, shape=(128, 1, 1500, 80))]
+ t.make_operator(
+ "Unsqueeze",
+ inputs=unsqueeze_inputs,
+ outputs=unsqueeze_outputs,
+ )
+
+
+def add_first_submodule_op(graph, state_dict, args):
+ """
+    The first submodule contains the following parts:
+    1. Conv2d + ReLU;
+    2. Conv2d + ReLU;
+    3. Transpose + Reshape;
+    4. MatMul + Add + Mul;
+ """
+
+ t = graph
+ conv2d0_weight_keys = [
+ "encoder.embed.conv.0.weight",
+ "encoder.embed.conv.0.bias",
+ ]
+ conv2d0_attributes = {
+ "dilations": [1, 1],
+ "group": 1,
+ "kernel_shape": [3, 3],
+ "pads": [0, 0, 0, 0],
+ "strides": [2, 2],
+ }
+ conv2d0_inputs = [t.get_variable("Unsqueeze_output_0")]
+ conv2d0_outputs = [t.make_variable("Conv_output_0", dtype=DataType.FLOAT)]
+
+ for key in conv2d0_weight_keys:
+ conv2d0_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ t.make_operator(
+ "Conv",
+ inputs=conv2d0_inputs,
+ outputs=conv2d0_outputs,
+ **conv2d0_attributes
+ )
+
+ relu0_inputs = conv2d0_outputs
+ relu0_outputs = [t.make_variable("Relu_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Relu",
+ inputs=relu0_inputs,
+ outputs=relu0_outputs
+ )
+
+ conv2d1_weight_keys = [
+ "encoder.embed.conv.2.weight",
+ "encoder.embed.conv.2.bias",
+ ]
+ conv2d1_attributes = {
+ "dilations": [1, 1],
+ "group": 1,
+ "kernel_shape": [3, 3],
+ "pads": [0, 0, 0, 0],
+ "strides": [2, 2],
+ }
+ conv2d1_inputs = relu0_outputs
+ conv2d1_outputs = [t.make_variable("Conv_output_1", dtype=DataType.FLOAT)]
+
+ for key in conv2d1_weight_keys:
+ conv2d1_inputs.append(t.make_variable(name=key, value=state_dict[key]))
+ t.make_operator(
+ "Conv",
+ inputs=conv2d1_inputs,
+ outputs=conv2d1_outputs,
+ **conv2d1_attributes
+ )
+
+ relu1_inputs = conv2d1_outputs
+ relu1_outputs = [t.make_variable("Relu_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Relu",
+ inputs=relu1_inputs,
+ outputs=relu1_outputs
+ )
+
+ tran_inputs = relu1_outputs
+ tran_outputs = [t.make_variable("Transpose_output_0", dtype=DataType.FLOAT)]
+ tran_attributes = {"perm": [0, 2, 1, 3]}
+ t.make_operator(
+ "Transpose",
+ inputs=tran_inputs,
+ outputs=tran_outputs,
+ **tran_attributes
+ )
+
+ reshape_inputs = tran_outputs
+ reshape_inputs.append(t.make_variable(name="constant_0", value=np.array([args.batch_size, -1, 4864]), dtype=DataType.INT64))
+ reshape_outputs = [t.make_variable("Reshape_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Reshape",
+ inputs=reshape_inputs,
+ outputs=reshape_outputs,
+ )
+
+ matmul_inputs = reshape_outputs
+ matmul_inputs.append(t.make_variable(name="embed.out.0.weight", value=state_dict["encoder.embed.out.0.weight"].transpose(1, 0))) # (256,4864)--->(4864,256)
+ matmul_outputs = [t.make_variable("MatMul_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "MatMul",
+ inputs=matmul_inputs,
+ outputs=matmul_outputs,
+ )
+
+ add_inputs = matmul_outputs
+ add_inputs.append(t.make_variable(name="embed.out.0.bias", value=state_dict["encoder.embed.out.0.bias"]))
+ add_outputs = [t.make_variable("Add_output_0", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Add",
+ inputs=add_inputs,
+ outputs=add_outputs,
+ )
+
+ mul_inputs = add_outputs
+ mul_inputs.append(t.make_variable(name="constant_1", value=np.array([16.], dtype=np.float32), dtype=DataType.FLOAT))
+ mul_outputs = [t.make_variable("Mul_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Mul",
+ inputs=mul_inputs,
+ outputs=mul_outputs,
+ )
+
+
+def add_encoder_ff_macaron_op(graph, state_dict, args, index):
+
+ t = graph
+ ff_macaron_keys = [
+ "encoder.encoders.{}.norm_ff_macaron.weight",
+ "encoder.encoders.{}.norm_ff_macaron.bias",
+ "encoder.encoders.{}.feed_forward_macaron.w_1.weight",
+ "encoder.encoders.{}.feed_forward_macaron.w_1.bias",
+ "encoder.encoders.{}.feed_forward_macaron.w_2.weight",
+ "encoder.encoders.{}.feed_forward_macaron.w_2.bias",
+ ]
+
+ attributes = {
+ "in_feature": 256,
+ "hidden_size": 2048,
+ "act_type": 12,
+ "ff_scale": 0.5,
+ }
+
+ if index == 0:
+ inputs = [graph.get_variable("Mul_output_1")]
+ else:
+ inputs = [graph.get_variable("norm_final_{}_output".format(index-1))]
+
+ outputs = [t.make_variable("ff_macaron_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in ff_macaron_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "PositionWiseFFNPluginDynamic_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_mhsa_op(graph, state_dict, args, index):
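+    """Multi-head self-attention with relative positional weights (linear_pos, pos_bias_u/v),
+    emitted as a single ConformerMultiHeadSelfAttentionPlugin_IxRT op."""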
+
+ t = graph
+ mhsa_keys = [
+ "encoder.encoders.{}.norm_mha.weight",
+ "encoder.encoders.{}.norm_mha.bias",
+ "encoder.encoders.{}.self_attn.linear_q.weight",
+ "encoder.encoders.{}.self_attn.linear_q.bias",
+ "encoder.encoders.{}.self_attn.linear_k.weight",
+ "encoder.encoders.{}.self_attn.linear_k.bias",
+ "encoder.encoders.{}.self_attn.linear_v.weight",
+ "encoder.encoders.{}.self_attn.linear_v.bias",
+ "encoder.encoders.{}.self_attn.linear_pos.weight",
+ "encoder.encoders.{}.self_attn.pos_bias_u",
+ "encoder.encoders.{}.self_attn.pos_bias_v",
+ "encoder.encoders.{}.self_attn.linear_out.weight",
+ "encoder.encoders.{}.self_attn.linear_out.bias",
+ ]
+
+ attributes = {
+ "bs": 128,
+ "seq_len": 374,
+ "n_head": 4,
+ "n_feat": 256,
+ }
+
+ if index == 0:
+ inputs = [
+ graph.get_variable("ff_macaron_{}_output".format(index)),
+ t.make_variable("mask", dtype=DataType.INT32, shape=(128, 1, 374)),
+ t.make_variable("pos_emb", dtype=DataType.FLOAT, shape=(1, 374, 256)),
+ ]
+ else:
+ inputs = [
+ graph.get_variable("ff_macaron_{}_output".format(index)),
+ graph.get_variable("mask"),
+ graph.get_variable("pos_emb"),
+ ]
+
+ outputs = [t.make_variable("mhsa_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in mhsa_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "ConformerMultiHeadSelfAttentionPlugin_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_conv_module_op(graph, state_dict, args, index):
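+    """Conformer convolution module (pointwise conv -> depthwise conv -> norm -> pointwise conv),
+    emitted as a single ConformerConvModulePlugin_IxRT op."""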
+
+ t = graph
+ conv_module_keys = [
+ "encoder.encoders.{}.norm_conv.weight",
+ "encoder.encoders.{}.norm_conv.bias",
+ "encoder.encoders.{}.conv_module.pointwise_conv1.weight",
+ "encoder.encoders.{}.conv_module.pointwise_conv1.bias",
+ "encoder.encoders.{}.conv_module.depthwise_conv.weight",
+ "encoder.encoders.{}.conv_module.depthwise_conv.bias",
+ "encoder.encoders.{}.conv_module.norm.weight",
+ "encoder.encoders.{}.conv_module.norm.bias",
+ "encoder.encoders.{}.conv_module.pointwise_conv2.weight",
+ "encoder.encoders.{}.conv_module.pointwise_conv2.bias",
+ ]
+
+ attributes = {
+ "kernel_size_1": 1,
+ "stride_1": 1,
+ "odim_1": 512,
+ "kernel_size_2": 8,
+ "stride_2": 1,
+ "odim_2": 256,
+ "kernel_size_3": 1,
+ "stride_3": 1,
+ "odim_3": 256,
+ }
+
+ inputs = [
+ graph.get_variable("mhsa_{}_output".format(index)),
+ graph.get_variable("mask"),
+ ]
+ outputs = [t.make_variable("conv_module_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in conv_module_keys:
+ key = key.format(index)
+
+ if "conv_module.depthwise_conv.weight" in key:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].permute(1, 2, 0).half(), dtype=DataType.FLOAT16))
+ elif "bias" in key and "norm" not in key:
+ inputs.append(t.make_variable(name=key, value=state_dict[key], dtype=DataType.FLOAT))
+ else:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "ConformerConvModulePlugin_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_positionwise_ff_op(graph, state_dict, args, index):
+
+ t = graph
+ positionwise_ff_keys = [
+ "encoder.encoders.{}.norm_ff.weight",
+ "encoder.encoders.{}.norm_ff.bias",
+ "encoder.encoders.{}.feed_forward.w_1.weight",
+ "encoder.encoders.{}.feed_forward.w_1.bias",
+ "encoder.encoders.{}.feed_forward.w_2.weight",
+ "encoder.encoders.{}.feed_forward.w_2.bias",
+ ]
+
+ attributes = {
+ "in_feature": 256,
+ "hidden_size": 2048,
+ "act_type": 12,
+ "ff_scale": 0.5,
+ }
+
+ inputs = [graph.get_variable('conv_module_{}_output'.format(index))]
+ outputs = [t.make_variable("positionwise_ff_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in positionwise_ff_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "PositionWiseFFNPluginDynamic_IxRT",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_encoder_ln_op(graph, state_dict, args, index):
+
+ t = graph
+ ln_keys = [
+ "encoder.encoders.{}.norm_final.weight",
+ "encoder.encoders.{}.norm_final.bias",
+ ]
+
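+    # Standard ONNX LayerNormalization attributes; the epsilon below is simply 1e-5
+    # rounded to float32 precision.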
+ attributes = {
+ "axis": -1,
+ "epsilon": 0.000009999999747378752,
+ "stash_type": 1,
+ }
+
+ inputs = [graph.get_variable("positionwise_ff_{}_output".format(index))]
+ outputs = [t.make_variable("norm_final_{}_output".format(index), dtype=DataType.FLOAT)]
+
+ for key in ln_keys:
+ key = key.format(index)
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "LayerNormalization",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_final_ln_op(graph, state_dict, args):
+
+ t = graph
+ ln_keys = [
+ "encoder.after_norm.weight",
+ "encoder.after_norm.bias",
+ ]
+
+ attributes = {
+ "axis": -1,
+ "epsilon": 0.000009999999747378752,
+ "stash_type": 1,
+ }
+
+ inputs = [graph.get_variable("norm_final_11_output")]
+ outputs = [t.make_variable("norm_final_output", dtype=DataType.FLOAT)]
+
+ for key in ln_keys:
+ inputs.append(t.make_variable(name=key, value=state_dict[key].half(), dtype=DataType.FLOAT16))
+
+ t.make_operator(
+ "LayerNormalization",
+ inputs=inputs,
+ outputs=outputs,
+ **attributes
+ )
+
+
+def add_ctc_op(graph, state_dict, args):
+ t = graph
+ # matmul
+ matmul_inputs = [graph.get_variable("norm_final_output")]
+ matmul_inputs.append(t.make_variable(name="ctc.ctc_lo.weight", value=state_dict["ctc.ctc_lo.weight"].transpose(1, 0))) # (4233,256)--->(256,4233)
+ matmul_outputs = [t.make_variable("MatMul_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "MatMul",
+ inputs=matmul_inputs,
+ outputs=matmul_outputs,
+ )
+
+ add_inputs = matmul_outputs
+ add_inputs.append(t.make_variable(name="ctc.ctc_lo.bias", value=state_dict["ctc.ctc_lo.bias"]))
+ add_outputs = [t.make_variable("Add_output_1", dtype=DataType.FLOAT)]
+ t.make_operator(
+ "Add",
+ inputs=add_inputs,
+ outputs=add_outputs,
+ )
+
+ logsoftmax_inputs = add_outputs
+ logsoftmax_outputs = [t.make_variable("output", dtype=DataType.FLOAT)]
+ attributes = {
+ "axis": 2
+ }
+ t.make_operator(
+ "LogSoftmax",
+ inputs=logsoftmax_inputs,
+ outputs=logsoftmax_outputs,
+ **attributes
+ )
+
+
+def main(args):
+ graph = Graph()
+ transform = GraphTransform(graph)
+ state_dict = torch.load(args.model_path)
+
+ # 0. Global CMVN: sub+mul+unsqueeze
+ add_global_cmvn_op(transform, state_dict, args)
+
+ # 1. First Submodule: Conv2d+Relu+Transpose+MatMul
+ add_first_submodule_op(transform, state_dict, args)
+
+ # 2. Second Submodule: ConformerEncoderLayer: 12 layers
+ for i in range(args.num_layers):
+ add_encoder_ff_macaron_op(transform, state_dict, args, i)
+ add_encoder_mhsa_op(transform, state_dict, args, i)
+ add_encoder_conv_module_op(transform, state_dict, args, i)
+ add_encoder_positionwise_ff_op(transform, state_dict, args, i)
+ add_encoder_ln_op(transform, state_dict, args, i)
+
+ # 3. Third Submodule: FinalNorm
+ add_final_ln_op(transform, state_dict, args)
+
+    # 4. Fourth Submodule: CTC + LogSoftmax
+ add_ctc_op(transform, state_dict, args)
+
+ # 5. set input and output
+ graph.add_input(graph.get_variable("input"))
+ graph.add_input(graph.get_variable("mask"))
+ graph.add_input(graph.get_variable("pos_emb"))
+ graph.add_output(graph.get_variable("output"))
+    # 6. export onnx file
+ create_target(saved_path=args.onnx_path).export(graph)
+ print("save onnx: ", args.onnx_path)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ model_name = args.model_name.lower()
+ args.num_layers = 12
+ args.hidden_size = 2048
+ args.head_num = 4
+ args.head_dim = 64
+ args.pad_id = 0
+ args.inner_size = 3072
+ main(args)
diff --git a/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..35aad9bbf24533bed27e98ddbe4e326fa897df88
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_accuracy.py
@@ -0,0 +1,285 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+import argparse
+import yaml
+import copy
+import torch
+import numpy as np
+
+from tqdm.contrib import tqdm
+from torch.utils.data import DataLoader
+from wenet.file_utils import read_symbol_table
+from wenet.dataset import Dataset
+from tools.compute_cer import Calculator, characterize, normalize, default_cluster
+import tensorrt
+from tensorrt import Dims
+from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings
+import pickle
+
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from utils import make_pad_mask, RelPositionalEncoding
+from postprocess import ctc_greedy_search
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description="recognize with your model")
+ parser.add_argument(
+ "--infer_type",
+ default="fp16",
+ choices=["fp16", "int8"],
+ help="inference type: fp16 or int8",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--batch_size", type=int, default=24)
+ parser.add_argument("--data_dir", required=True, help="test data directory")
+ parser.add_argument(
+ "--model_dir", type=str, required=True, help="model for inference"
+ )
+ args = parser.parse_args()
+ return args
+
+
+def tensorrt_infer(engine, context, all_inputs):
+
+ input_names = ["input", "mask", "pos_emb"]
+ output_names = ["output"]
+
+ for input_name, input_data in zip(input_names, all_inputs):
+ input_idx = engine.get_binding_index(input_name)
+ input_shape = input_data.shape
+ context.set_binding_shape(input_idx, Dims(input_shape))
+
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+ pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+ for i, input_data in enumerate(all_inputs):
+ cuda.memcpy_htod(inputs[i]["allocation"], input_data)
+
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"])
+ return pred_output
+
+
+def engine_init(engine):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    engine, context = create_engine_context(engine, logger)
+
+    return engine, context
+
+
+def calculate_cer(data, reference_data):
+ calculator = Calculator()
+ tochar = True
+ split = None
+ case_sensitive = False
+ ignore_words = set()
+ rec_set = {}
+ for line in data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
+
+ default_clusters = {}
+ default_words = {}
+ for line in reference_data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+ result = calculator.calculate(lab, rec)
+
+ result = calculator.overall()
+ cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"]
+ corr = result["cor"] / result["all"]
+
+ return cer, corr
+
+
+def main():
+ args = get_args()
+
+    # Load the config file
+ config_fn = os.path.join(args.model_dir, "config.yaml")
+ with open(config_fn, "r") as fin:
+ configs = yaml.load(fin, Loader=yaml.FullLoader)
+
+ dataset_conf = copy.deepcopy(configs["dataset_conf"])
+ dataset_conf["filter_conf"]["max_length"] = 102400
+ dataset_conf["filter_conf"]["min_length"] = 0
+ dataset_conf["filter_conf"]["token_max_length"] = 102400
+ dataset_conf["filter_conf"]["token_min_length"] = 0
+ dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400
+ dataset_conf["filter_conf"]["min_output_input_ratio"] = 0
+ dataset_conf["speed_perturb"] = False
+ dataset_conf["spec_aug"] = False
+ dataset_conf["shuffle"] = False
+ dataset_conf["sort"] = True
+ dataset_conf["fbank_conf"]["dither"] = 0.0
+ dataset_conf["batch_conf"]["batch_type"] = "static"
+ dataset_conf["batch_conf"]["batch_size"] = args.batch_size
+
+ # Load dict
+ dict_fn = os.path.join(args.model_dir, "words.txt")
+ char_dict = {}
+ with open(dict_fn, "r", encoding="utf8") as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ char_dict[int(arr[1])] = arr[0]
+ eos = len(char_dict) - 1
+
+ data_type = "raw"
+ test_data_fn = os.path.join(args.data_dir, "data.list")
+ symbol_table = read_symbol_table(dict_fn)
+ test_dataset = Dataset(
+ data_type, test_data_fn, symbol_table, dataset_conf, partition=False
+ )
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+ data_path_pkl = os.path.join(args.data_dir, f"aishell_test_data_bs{args.batch_size}.pkl")
+
+ print("*** 1. Prepare data ***")
+ if not os.path.isfile(data_path_pkl):
+ eval_samples = []
+ max_batch_size = -1
+ max_feature_length = -1
+ for batch in test_data_loader:
+ keys, feats, target, feats_lengths, target_lengths = batch
+ max_feature_length = max(max_feature_length, feats.size(1))
+ max_batch_size = max(max_batch_size, feats.size(0))
+ eval_samples.append(
+ [
+ keys,
+ feats.cpu().numpy().astype(np.float16),
+ feats_lengths.cpu().numpy().astype(np.int32),
+ ]
+ )
+ with open(data_path_pkl, "wb") as f:
+ pickle.dump(
+ [
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ],
+ f,
+ )
+ else:
+ print(f"load data from tmp: {data_path_pkl}")
+ with open(data_path_pkl, "rb") as f:
+ (
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ) = pickle.load(f)
+ print(
+ f"dataset max shape: batch_size: {max_batch_size}, feat_length: {max_feature_length}"
+ )
+
+ print("*** 2. Load engine ***")
+ engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine")
+ engine, context = engine_init(engine_path)
+
+ print("*** 3. Warm up ***")
+ if args.warm_up > 0:
+ for i in range(args.warm_up):
+ feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32)
+ feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500
+ mask_tmp = make_pad_mask(feats_lengths_tmp, 1500)
+ mask_len_tmp = mask_tmp.shape[-1]
+ pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy()
+ all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp]
+ tensorrt_infer(engine, context, all_inputs)
+
+ results = []
+ for keys, feats, feats_lengths in tqdm(eval_samples):
+ b, seq_len, feat = feats.shape
+
+ inputs = feats.astype(np.float32)
+ mask = make_pad_mask(feats_lengths, seq_len)
+ mask_len = mask.shape[-1]
+ pos_emb = rel_positional_encoding(mask_len).numpy()
+
+ all_inputs = [inputs, mask, pos_emb]
+ hyps = tensorrt_infer(
+ engine,
+ context,
+ all_inputs
+ )
+
+ ctc_probs = torch.from_numpy(hyps)
+ ctc_lens = torch.from_numpy(feats_lengths)
+ hyps = ctc_greedy_search(ctc_probs, ctc_lens)
+
+ for i, key in enumerate(keys):
+ line = f"{key} "
+ for w in hyps[i]:
+ w = w - 1
+ if w == eos:
+ break
+ line += char_dict[w]
+ results.append(line)
+
+    # Compute CER against the reference transcripts
+ reference_file = os.path.join(args.data_dir, "text")
+ reference_data = []
+ for line in open(reference_file, "r", encoding="utf-8"):
+ reference_data.append(line)
+
+ cer, corr = calculate_cer(results, reference_data)
+ target_cer = float(os.environ["Accuracy"])
+ print("CER: ", cer, "target CER: ", target_cer)
+ if cer <= target_cer:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19233fa6813722083e1e86fbfc310dcd1370670
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/ixrt_inference_performance.py
@@ -0,0 +1,273 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+import time
+
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+import argparse
+import yaml
+import copy
+import torch
+import numpy as np
+
+from tqdm.contrib import tqdm
+from torch.utils.data import DataLoader
+from wenet.file_utils import read_symbol_table
+from wenet.dataset import Dataset
+from tools.compute_cer import Calculator, characterize, normalize, default_cluster
+import tensorrt
+from tensorrt import Dims
+from common import create_engine_context, get_io_bindings,trtapi,setup_io_bindings
+import pickle
+
+import pycuda.autoinit
+import pycuda.driver as cuda
+
+from utils import make_pad_mask, RelPositionalEncoding
+from postprocess import ctc_greedy_search
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description="recognize with your model")
+ parser.add_argument(
+ "--infer_type",
+ default="fp16",
+ choices=["fp16", "int8"],
+ help="inference type: fp16 or int8",
+ )
+ parser.add_argument("--warm_up", type=int, default=3, help="warm_up count")
+ parser.add_argument("--batch_size", type=int, default=24)
+ parser.add_argument("--data_dir", required=True, help="test data directory")
+ parser.add_argument(
+ "--model_dir", type=str, required=True, help="model for inference"
+ )
+ args = parser.parse_args()
+ return args
+
+
+def tensorrt_infer(engine, context, all_inputs):
+
+ input_names = ["input", "mask", "pos_emb"]
+ output_names = ["output"]
+
+ for input_name, input_data in zip(input_names, all_inputs):
+ input_idx = engine.get_binding_index(input_name)
+ input_shape = input_data.shape
+ context.set_binding_shape(input_idx, Dims(input_shape))
+
+ inputs, outputs, allocations = setup_io_bindings(engine, context)
+ pred_output = np.zeros(outputs[0]["shape"], outputs[0]["dtype"])
+
+ for i, input_data in enumerate(all_inputs):
+ cuda.memcpy_htod(inputs[i]["allocation"], input_data)
+
+ context.execute_v2(allocations)
+ cuda.memcpy_dtoh(pred_output, outputs[0]["allocation"])
+ return pred_output
+
+
+def engine_init(engine):
+    logger = tensorrt.Logger(tensorrt.Logger.ERROR)
+    engine, context = create_engine_context(engine, logger)
+
+    return engine, context
+
+
+def calculate_cer(data, reference_data):
+ calculator = Calculator()
+ tochar = True
+ split = None
+ case_sensitive = False
+ ignore_words = set()
+ rec_set = {}
+ for line in data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
+
+ default_clusters = {}
+ default_words = {}
+ for line in reference_data:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+ result = calculator.calculate(lab, rec)
+
+ result = calculator.overall()
+ cer = float(result["ins"] + result["sub"] + result["del"]) / result["all"]
+ corr = result["cor"] / result["all"]
+
+ return cer, corr
+
+
+def main():
+ args = get_args()
+
+    # Load the config file
+ config_fn = os.path.join(args.model_dir, "config.yaml")
+ with open(config_fn, "r") as fin:
+ configs = yaml.load(fin, Loader=yaml.FullLoader)
+
+ dataset_conf = copy.deepcopy(configs["dataset_conf"])
+ dataset_conf["filter_conf"]["max_length"] = 102400
+ dataset_conf["filter_conf"]["min_length"] = 0
+ dataset_conf["filter_conf"]["token_max_length"] = 102400
+ dataset_conf["filter_conf"]["token_min_length"] = 0
+ dataset_conf["filter_conf"]["max_output_input_ratio"] = 102400
+ dataset_conf["filter_conf"]["min_output_input_ratio"] = 0
+ dataset_conf["speed_perturb"] = False
+ dataset_conf["spec_aug"] = False
+ dataset_conf["shuffle"] = False
+ dataset_conf["sort"] = True
+ dataset_conf["fbank_conf"]["dither"] = 0.0
+ dataset_conf["batch_conf"]["batch_type"] = "static"
+ dataset_conf["batch_conf"]["batch_size"] = args.batch_size
+
+ # Load dict
+ dict_fn = os.path.join(args.model_dir, "words.txt")
+ char_dict = {}
+ with open(dict_fn, "r", encoding="utf8") as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ char_dict[int(arr[1])] = arr[0]
+ eos = len(char_dict) - 1
+
+ data_type = "raw"
+ test_data_fn = os.path.join(args.data_dir, "data.list")
+ symbol_table = read_symbol_table(dict_fn)
+ test_dataset = Dataset(
+ data_type, test_data_fn, symbol_table, dataset_conf, partition=False
+ )
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+
+ data_path_pkl = os.path.join(args.data_dir, f"aishell_test_data_bs{args.batch_size}.pkl")
+
+ print("*** 1. Prepare data ***")
+ if not os.path.isfile(data_path_pkl):
+ eval_samples = []
+ max_batch_size = -1
+ max_feature_length = -1
+ for batch in test_data_loader:
+ keys, feats, target, feats_lengths, target_lengths = batch
+ max_feature_length = max(max_feature_length, feats.size(1))
+ max_batch_size = max(max_batch_size, feats.size(0))
+ eval_samples.append(
+ [
+ keys,
+ feats.cpu().numpy().astype(np.float16),
+ feats_lengths.cpu().numpy().astype(np.int32),
+ ]
+ )
+ with open(data_path_pkl, "wb") as f:
+ pickle.dump(
+ [
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ],
+ f,
+ )
+ else:
+ print(f"load data from tmp: {data_path_pkl}")
+ with open(data_path_pkl, "rb") as f:
+ (
+ eval_samples,
+ max_batch_size,
+ max_feature_length
+ ) = pickle.load(f)
+ print(
+ f"dataset max shape: batch_size: {max_batch_size}, feat_length: {max_feature_length}"
+ )
+
+ print("*** 2. Load engine ***")
+ engine_path = os.path.join(args.model_dir, f"conformer_encoder_fusion.engine")
+ engine, context = engine_init(engine_path)
+
+ print("*** 3. Warm up ***")
+ if args.warm_up > 0:
+ for i in range(args.warm_up):
+ feats_tmp = np.ones((args.batch_size, 1500, 80)).astype(np.float32)
+ feats_lengths_tmp = np.ones((args.batch_size)).astype(np.int32) * 1500
+ mask_tmp = make_pad_mask(feats_lengths_tmp, 1500)
+ mask_len_tmp = mask_tmp.shape[-1]
+ pos_emb_tmp = rel_positional_encoding(mask_len_tmp).numpy()
+ all_inputs = [feats_tmp, mask_tmp, pos_emb_tmp]
+ tensorrt_infer(engine, context, all_inputs)
+
+ print("*** 4. Inference ***")
+ start_time = time.time()
+ num_samples = 0
+ results = []
+ for keys, feats, feats_lengths in tqdm(eval_samples):
+ b, seq_len, feat = feats.shape
+ num_samples += b
+ inputs = feats.astype(np.float32)
+ mask = make_pad_mask(feats_lengths, seq_len)
+ mask_len = mask.shape[-1]
+ pos_emb = rel_positional_encoding(mask_len).numpy()
+
+ all_inputs = [inputs, mask, pos_emb]
+ hyps = tensorrt_infer(
+ engine,
+ context,
+ all_inputs
+ )
+
+ eval_time = time.time() - start_time
+
+ QPS = num_samples / eval_time
+ print(f"Recognize {num_samples} sentences, {QPS} sentences/s")
+ target_qps = float(os.environ["Accuracy"])
+ print("QPS: = ", QPS, "target QPS: ", target_qps)
+ if QPS >= target_qps:
+ print("pass!")
+ exit()
+ else:
+ print("failed!")
+ exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py b/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..33f8b0465aee011298fa9933086fbdc1c8dbd4d4
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/postprocess/__init__.py
@@ -0,0 +1 @@
+from .search import ctc_greedy_search
diff --git a/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py b/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2ae55650539b9d0be352e78a64999606ac12fbb
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/postprocess/search.py
@@ -0,0 +1,103 @@
+import math
+from collections import defaultdict
+from typing import List, Dict
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+
+def remove_duplicates_and_blank(hyp: List[int],
+ blank_id: int = 0) -> List[int]:
+ new_hyp: List[int] = []
+ cur = 0
+ while cur < len(hyp):
+ if hyp[cur] != blank_id:
+ new_hyp.append(hyp[cur])
+ prev = cur
+ while cur < len(hyp) and hyp[cur] == hyp[prev]:
+ cur += 1
+ return new_hyp
+
+
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+ """Make mask tensor containing indices of padded part.
+
+ See description of make_non_pad_mask.
+
+ Args:
+ lengths (torch.Tensor): Batch of lengths (B,).
+ Returns:
+ torch.Tensor: Mask tensor containing indices of padded part.
+
+ Examples:
+ >>> lengths = [5, 3, 2]
+ >>> make_pad_mask(lengths)
+ masks = [[0, 0, 0, 0 ,0],
+ [0, 0, 0, 1, 1],
+ [0, 0, 1, 1, 1]]
+ """
+ batch_size = lengths.size(0)
+ max_len = max_len if max_len > 0 else lengths.max().item()
+ seq_range = torch.arange(0,
+ max_len,
+ dtype=torch.int64,
+ device=lengths.device)
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
+ seq_length_expand = lengths.unsqueeze(-1)
+ mask = seq_range_expand >= seq_length_expand
+
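+    # Subsample the mask twice with stride 2 so it follows the encoder's ~4x conv
+    # subsampling of the time axis (assumed to mirror the Conformer frontend).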
+ mask = mask[:, 2::2][:, 2::2]
+ return mask
+
+
+class DecodeResult:
+
+ def __init__(self,
+ tokens: List[int],
+ score: float = 0.0,
+ confidence: float = 0.0,
+ tokens_confidence: List[float] = None,
+ times: List[int] = None,
+ nbest: List[List[int]] = None,
+ nbest_scores: List[float] = None,
+ nbest_times: List[List[int]] = None):
+ """
+ Args:
+ tokens: decode token list
+ score: the total decode score of this result
+            confidence: the total confidence of this result, in the range [0, 1]
+ tokens_confidence: confidence of each token
+ times: timestamp of each token, list of (start, end)
+ nbest: nbest result
+ nbest_scores: score of each nbest
+ nbest_times:
+ """
+ self.tokens = tokens
+ self.score = score
+ self.confidence = confidence
+ self.tokens_confidence = tokens_confidence
+ self.times = times
+ self.nbest = nbest
+ self.nbest_scores = nbest_scores
+ self.nbest_times = nbest_times
+
+
+def ctc_greedy_search(ctc_probs: torch.Tensor,
+                      ctc_lens: torch.Tensor,
+                      blank_id: int = 0) -> List[List[int]]:
+
+ batch_size = ctc_probs.shape[0]
+ maxlen = ctc_probs.size(1)
+ topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1)
+ topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen)
+
+ mask_ctc_lens = ctc_lens[0].item()
+ mask = make_pad_mask(ctc_lens, mask_ctc_lens) # (B, maxlen)
+ topk_index = topk_index.masked_fill_(mask, blank_id) # (B, maxlen)
+ hyps = [hyp.tolist() for hyp in topk_index]
+ scores = topk_prob.max(1)
+ results = []
+ for hyp in hyps:
+ results.append(remove_duplicates_and_blank(hyp, blank_id))
+ return results
+
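+# Usage sketch (assumed shapes): given log-probs `ctc_probs` of shape (B, T, V) from the
+# engine and the corresponding lengths `ctc_lens`, e.g.
+#   hyps = ctc_greedy_search(torch.from_numpy(probs), torch.from_numpy(lengths))
+# each entry of `hyps` is a token-id sequence with blanks and repeats removed.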
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh
new file mode 100755
index 0000000000000000000000000000000000000000..985564c2294b2a413531d6ced018029ec911fb23
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/aishell_data_prepare.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+# set -euox pipefail
+
+data_dir=$1
+tool_dir=$2
+
+wav_dir=${data_dir}/wav
+aishell_text=${data_dir}/transcript/aishell_transcript_v0.8.txt
+
+# data directory check
+if [ ! -d $wav_dir ] || [ ! -f $aishell_text ]; then
+ echo "Error: wav directory and aishell text not found!"
+ exit 1;
+fi
+
+# find test wav file
+local_dir=${data_dir}/local
+mkdir -p $local_dir
+find $wav_dir -iname "*.wav" > $local_dir/wav.flist || exit 1;
+
+# Transcriptions preparation
+sed -e 's/\.wav//' $local_dir/wav.flist | awk -F '/' '{print $NF}' > $local_dir/utt.list
+paste -d' ' $local_dir/utt.list $local_dir/wav.flist > $local_dir/wav.scp_all
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $aishell_text > $local_dir/transcripts.txt
+awk '{print $1}' $local_dir/transcripts.txt > $local_dir/utt.list
+${tool_dir}/filter_scp.pl -f 1 $local_dir/utt.list $local_dir/wav.scp_all | sort -u > $local_dir/wav.scp
+sort -u $local_dir/transcripts.txt > $local_dir/text
+echo "Preparing transcriptions succeeded!"
+
+test_dir=${data_dir}/test
+mkdir -p ${test_dir}
+for f in wav.scp text; do
+ cp $local_dir/$f ${test_dir}/$f || exit 1;
+done
+rm -r ${data_dir}/local
+
+# data_type can be `raw` or `shard`. Typically, `raw` is used for small datasets,
+# while `shard` is used for large datasets (over 1k hours) and is faster for
+# reading data and training.
+data_type=raw
+num_utts_per_shard=1000
+
+# remove the space between the text labels for Mandarin dataset
+cp $test_dir/text $test_dir/text.org
+paste -d " " <(cut -f 1 -d" " ${test_dir}/text.org) \
+ <(cut -f 2- -d" " ${test_dir}/text.org | tr -d " ") \
+ > ${test_dir}/text
+rm ${test_dir}/text.org
+
+# Prepare required format
+if [ $data_type == "shard" ]; then
+ ${tool_dir}/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
+ --num_threads 16 $test_dir/wav.scp $test_dir/text \
+ $(realpath $test_dir/shards) $test_dir/data.list
+else
+ ${tool_dir}/make_raw_list.py $test_dir/wav.scp $test_dir/text \
+ $test_dir/data.list
+fi
+
+echo "AISHELL data preparation succeeded!"
\ No newline at end of file
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1af4bb4e03a0c9c6084ae7a122f66f765c27c86
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_accuracy_ixrt.sh
@@ -0,0 +1,49 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+
+PROJECT_DIR=${current_path}/..
+DATA_DIR=${current_path}/../aishell_test_data/test
+MODEL_DIR=${current_path}/../conformer_checkpoints
+
+export Accuracy=${Accuracy:=0.052}
+
+cd ${PROJECT_DIR}
+
+echo "Step1.Export Onnx From Checkpoints!"
+python3 convert2onnx.py \
+ --model_name "Conformer" \
+ --model_path=${MODEL_DIR}/final.pt \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --batch_size=8
+
+echo "Step2.Build Engine!"
+python3 build_engine.py \
+ --model_name "Conformer" \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
+ --max_batch_size=8 \
+ --max_seq_len=1500
+
+echo "Step3.Inference(Test ACC)!"
+python3 ixrt_inference_accuracy.py \
+ --infer_type fp16 \
+ --warm_up 3 \
+ --batch_size ${BATCH_SIZE:=8} \
+ --data_dir ${DATA_DIR} \
+ --model_dir ${MODEL_DIR}
diff --git a/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dc02673c03fb21a4301b757a18885af81cbad31d
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/scripts/infer_conformer_fp16_performance_ixrt.sh
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+
+EXIT_STATUS=0
+check_status()
+{
+ if ((${PIPESTATUS[0]} != 0));then
+ echo "fails"
+ EXIT_STATUS=1
+ fi
+}
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+
+PROJECT_DIR=${current_path}/..
+DATA_DIR=${current_path}/../aishell_test_data/test
+MODEL_DIR=${current_path}/../conformer_checkpoints
+
+export Accuracy=${Accuracy:=350}
+
+cd ${PROJECT_DIR}
+
+
+echo "Step1.Export Onnx From Checkpoints!"
+python3 convert2onnx.py \
+ --model_name "Conformer" \
+ --model_path=${MODEL_DIR}/final.pt \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --batch_size=24
+
+echo "Step2.Build Engine!"
+python3 build_engine.py \
+ --model_name "Conformer" \
+ --onnx_path=${MODEL_DIR}/conformer_encoder_fusion.onnx \
+ --engine_path=${MODEL_DIR}/conformer_encoder_fusion.engine \
+ --max_batch_size=24 \
+ --max_seq_len=1500
+
+echo "Step3.Inference(Test QPS)!"
+python3 ixrt_inference_performance.py \
+ --infer_type fp16 \
+ --batch_size ${BATCH_SIZE:=24} \
+ --data_dir ${DATA_DIR} \
+ --model_dir ${MODEL_DIR}
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/__init__.py b/models/speech/speech_recognition/conformer/ixrt/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py b/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py
new file mode 100755
index 0000000000000000000000000000000000000000..a5db08979f4d31a4a2ac9e4ceb0d122537690aac
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/compute_cer.py
@@ -0,0 +1,532 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+import sys
+import unicodedata
+import codecs
+
+remove_tag = True
+spacelist = [' ', '\t', '\r', '\n']
+puncts = ['!', ',', '?',
+ '、', '。', '!', ',', ';', '?',
+ ':', '「', '」', '︰', '『', '』', '《', '》']
+
+def characterize(string) :
+ res = []
+ i = 0
+ while i < len(string):
+ char = string[i]
+ if char in puncts:
+ i += 1
+ continue
+ cat1 = unicodedata.category(char)
+ # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
+ i += 1
+ continue
+ if cat1 == 'Lo': # letter-other
+ res.append(char)
+ i += 1
+ else:
+            # some input looks like a tag such as <unk> glued to the next word; we want to separate it into two words.
+ sep = ' '
+ if char == '<':
+ sep = '>'
+ j = i + 1
+ while j < len(string):
+ c = string[j]
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
+ break
+ j += 1
+ if j < len(string) and string[j] == '>':
+ j += 1
+ res.append(string[i:j])
+ i = j
+ return res
+
+def stripoff_tags(x):
+ if not x:
+ return ''
+ chars = []
+ i = 0
+ T = len(x)
+ while i < T:
+ if x[i] == '<':
+ while i < T and x[i] != '>':
+ i += 1
+ i += 1
+ else:
+ chars.append(x[i])
+ i += 1
+ return ''.join(chars)
+
+
+def normalize(sentence, ignore_words, cs, split=None):
+ """ sentence, ignore_words are both in unicode
+ """
+ new_sentence = []
+ for token in sentence:
+ x = token
+ if not cs:
+ x = x.upper()
+ if x in ignore_words:
+ continue
+ if remove_tag:
+ x = stripoff_tags(x)
+ if not x:
+ continue
+ if split and x in split:
+ new_sentence += split[x]
+ if x.isalnum():
+ for k in x:
+ new_sentence.append(k)
+ else:
+ new_sentence.append(x)
+ return new_sentence
+
+class Calculator :
+ def __init__(self) :
+ self.data = {}
+ self.space = []
+ self.cost = {}
+ self.cost['cor'] = 0
+ self.cost['sub'] = 1
+ self.cost['del'] = 1
+ self.cost['ins'] = 1
+
+ def calculate(self, lab, rec) :
+ # Initialization
+ lab.insert(0, '')
+ rec.insert(0, '')
+ while len(self.space) < len(lab) :
+ self.space.append([])
+ for row in self.space :
+ for element in row :
+ element['dist'] = 0
+ element['error'] = 'non'
+ while len(row) < len(rec) :
+ row.append({'dist' : 0, 'error' : 'non'})
+ for i in range(len(lab)) :
+ self.space[i][0]['dist'] = i
+ self.space[i][0]['error'] = 'del'
+ for j in range(len(rec)) :
+ self.space[0][j]['dist'] = j
+ self.space[0][j]['error'] = 'ins'
+ self.space[0][0]['error'] = 'non'
+ for token in lab :
+ if token not in self.data and len(token) > 0 :
+ self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0,
+ 'ins' : 0, 'del' : 0}
+ for token in rec :
+ if token not in self.data and len(token) > 0 :
+ self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0,
+ 'ins' : 0, 'del' : 0}
+ # Computing edit distance
+ for i, lab_token in enumerate(lab) :
+ for j, rec_token in enumerate(rec) :
+ if i == 0 or j == 0 :
+ continue
+ min_dist = sys.maxsize
+ min_error = 'none'
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
+ error = 'del'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
+ error = 'ins'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ if lab_token == rec_token :
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
+ error = 'cor'
+ else :
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
+ error = 'sub'
+ if dist < min_dist :
+ min_dist = dist
+ min_error = error
+ self.space[i][j]['dist'] = min_dist
+ self.space[i][j]['error'] = min_error
+ # Tracing back
+ result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0,
+ 'ins': 0, 'del': 0}
+ i = len(lab) - 1
+ j = len(rec) - 1
+ while True :
+ if self.space[i][j]['error'] == 'cor' : # correct
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
+ result['all'] = result['all'] + 1
+ result['cor'] = result['cor'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'sub' : # substitution
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
+ result['all'] = result['all'] + 1
+ result['sub'] = result['sub'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'del' : # deletion
+ if len(lab[i]) > 0 :
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
+ result['all'] = result['all'] + 1
+ result['del'] = result['del'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, "")
+ i = i - 1
+ elif self.space[i][j]['error'] == 'ins' : # insertion
+ if len(rec[j]) > 0 :
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
+ result['ins'] = result['ins'] + 1
+ result['lab'].insert(0, "")
+ result['rec'].insert(0, rec[j])
+ j = j - 1
+ elif self.space[i][j]['error'] == 'non' : # starting point
+ break
+ else : # shouldn't reach here
+ print('this should not happen , i={i} , j={j} , \
+ error={error}'.
+ format(i=i, j=j, error=self.space[i][j]['error']))
+ return result
+
+ def overall(self) :
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in self.data :
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def cluster(self, data) :
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in data :
+ if token in self.data :
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def keys(self) :
+ return list(self.data.keys())
+
+def width(string):
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
+
+def default_cluster(word) :
+ unicode_names = [unicodedata.name(char) for char in word]
+ for i in reversed(range(len(unicode_names))) :
+ if unicode_names[i].startswith('DIGIT') : # 1
+ unicode_names[i] = 'Number' # 'DIGIT'
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
+ unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) :
+ # 明 / 郎
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
+ unicode_names[i].startswith('LATIN SMALL LETTER')) :
+ # A / a
+ unicode_names[i] = 'English' # 'LATIN LETTER'
+ elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
+ elif (unicode_names[i].startswith('AMPERSAND') or
+ unicode_names[i].startswith('APOSTROPHE') or
+ unicode_names[i].startswith('COMMERCIAL AT') or
+ unicode_names[i].startswith('DEGREE CELSIUS') or
+ unicode_names[i].startswith('EQUALS SIGN') or
+ unicode_names[i].startswith('FULL STOP') or
+ unicode_names[i].startswith('HYPHEN-MINUS') or
+ unicode_names[i].startswith('LOW LINE') or
+ unicode_names[i].startswith('NUMBER SIGN') or
+ unicode_names[i].startswith('PLUS SIGN') or
+ unicode_names[i].startswith('SEMICOLON')) :
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
+ del unicode_names[i]
+ else :
+ return 'Other'
+ if len(unicode_names) == 0 :
+ return 'Other'
+ if len(unicode_names) == 1 :
+ return unicode_names[0]
+ for i in range(len(unicode_names) - 1) :
+ if unicode_names[i] != unicode_names[i + 1] :
+ return 'Other'
+ return unicode_names[0]
+
+def usage() :
+ print("compute-wer.py : compute word error rate (WER) \
+ and align recognition results and references.")
+ print(" usage : python compute-wer.py [--cs={0,1}] \
+ [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] \
+ [--padding-symbol={space,underline}] test.ref test.hyp > test.wer")
+
+if __name__ == '__main__':
+ if len(sys.argv) == 1 :
+ usage()
+ sys.exit(0)
+ calculator = Calculator()
+ cluster_file = ''
+ ignore_words = set()
+ tochar = False
+ verbose = 1
+ padding_symbol = ' '
+ case_sensitive = False
+ max_words_per_line = sys.maxsize
+ split = None
+ while len(sys.argv) > 3:
+ a = '--maxw='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):]
+ del sys.argv[1]
+ max_words_per_line = int(b)
+ continue
+ a = '--rt='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ remove_tag = (b == 'true') or (b != '0')
+ continue
+ a = '--cs='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ case_sensitive = (b == 'true') or (b != '0')
+ continue
+ a = '--cluster='
+ if sys.argv[1].startswith(a):
+ cluster_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ continue
+ a = '--splitfile='
+ if sys.argv[1].startswith(a):
+ split_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ split = dict()
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ words = line.strip().split()
+ if len(words) >= 2:
+ split[words[0]] = words[1:]
+ continue
+ a = '--ig='
+ if sys.argv[1].startswith(a):
+ ignore_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ line = line.strip()
+ if len(line) > 0:
+ ignore_words.add(line)
+ continue
+ a = '--char='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ tochar = (b == 'true') or (b != '0')
+ continue
+ a = '--v='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ verbose = 0
+ try:
+ verbose = int(b)
+ except Exception:
+ if b == 'true' or b != '0':
+ verbose = 1
+ continue
+ a = '--padding-symbol='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ if b == 'space':
+ padding_symbol = ' '
+ elif b == 'underline':
+ padding_symbol = '_'
+ continue
+ if True or sys.argv[1].startswith('-'):
+ # ignore invalid switch
+ del sys.argv[1]
+ continue
+
+ if not case_sensitive:
+ ig = set([w.upper() for w in ignore_words])
+ ignore_words = ig
+
+ default_clusters = {}
+ default_words = {}
+
+ ref_file = sys.argv[1]
+ hyp_file = sys.argv[2]
+ rec_set = {}
+ if split and not case_sensitive:
+ newsplit = dict()
+ for w in split:
+ words = split[w]
+ for i in range(len(words)):
+ words[i] = words[i].upper()
+ newsplit[w.upper()] = words
+ split = newsplit
+
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
+ for line in fh:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words,
+ case_sensitive, split)
+
+ # compute error rate on the interaction of reference file and hyp file
+ for line in open(ref_file, 'r', encoding='utf-8') :
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.rstrip('\n').split()
+ if len(array) == 0:
+ continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+ if verbose:
+ print('\nutt: %s' % fid)
+
+ for word in rec + lab :
+ if word not in default_words :
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters :
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name] :
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+
+ result = calculator.calculate(lab, rec)
+ if verbose:
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('WER: %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ space = {}
+ space['lab'] = []
+ space['rec'] = []
+ for idx in range(len(result['lab'])) :
+ len_lab = width(result['lab'][idx])
+ len_rec = width(result['rec'][idx])
+ length = max(len_lab, len_rec)
+ space['lab'].append(length - len_lab)
+ space['rec'].append(length - len_rec)
+ upper_lab = len(result['lab'])
+ upper_rec = len(result['rec'])
+ lab1, rec1 = 0, 0
+ while lab1 < upper_lab or rec1 < upper_rec:
+ if verbose > 1:
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('lab:', end=' ')
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
+ for idx in range(lab1, lab2):
+ token = result['lab'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['lab'][idx]) :
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print()
+ if verbose > 1:
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('rec:', end=' ')
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
+ for idx in range(rec1, rec2):
+ token = result['rec'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['rec'][idx]) :
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print('\n', end='\n')
+ lab1 = lab2
+ rec1 = rec2
+
+ if verbose:
+ print('==================================================='
+ '========================')
+ print()
+
+ result = calculator.overall()
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('Overall -> wer %4.2f %% Corr %4.2f %%' % (wer, result['cor']*100/result['all']), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ if not verbose:
+ print()
+
+ if verbose:
+ for cluster_id in default_clusters :
+ result = calculator.cluster(k for k in default_clusters[cluster_id])
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ if len(cluster_file) > 0 : # compute separated WERs for word clusters
+ cluster_id = ''
+ cluster = []
+ for line in open(cluster_file, 'r', encoding='utf-8') :
+            for token in line.rstrip('\n').split() :
+                # end of cluster reached, like </Keyword>
+                if token[0:2] == '</' and token[len(token) - 1] == '>' and \
+                    token.lstrip('</').rstrip('>') == cluster_id :
+ result = calculator.cluster(cluster)
+ if result['all'] != 0 :
+ wer = float(result['ins'] + result['sub'] +
+ result['del']) * 100.0 / result['all']
+ else :
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ cluster_id = ''
+ cluster = []
+                # begin of cluster reached, like <Keyword>
+ elif (token[0] == '<' and token[len(token) - 1] == '>' and
+ cluster_id == ''):
+ cluster_id = token.lstrip('<').rstrip('>')
+ cluster = []
+ # general terms, like WEATHER / CAR / ...
+ else :
+ cluster.append(token)
+ print()
+ print('======================================='
+ '====================================')
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl b/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl
new file mode 100755
index 0000000000000000000000000000000000000000..b76d37f41be0886470281978bfacf97f6b8ae976
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/filter_scp.pl
@@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+# Copyright 2010-2012 Microsoft Corporation
+# Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose "n-th" field is an utterance id), printing
+# out only those lines whose "n-th" field is in id_list. The index of
+# the "n-th" field is 1, by default, but can be changed by using
+# the -f switch
+
+$exclude = 0;
+$field = 1;
+$shifted = 0;
+
+do {
+ $shifted=0;
+ if ($ARGV[0] eq "--exclude") {
+ $exclude = 1;
+ shift @ARGV;
+ $shifted=1;
+ }
+ if ($ARGV[0] eq "-f") {
+ $field = $ARGV[1];
+ shift @ARGV; shift @ARGV;
+ $shifted=1
+ }
+} while ($shifted);
+
+if(@ARGV < 1 || @ARGV > 2) {
+ die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" .
+ "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
+ "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
+ "only the lines that were *not* in id_list.\n" .
+ "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
+ "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
+ "-f option, add 1 to the argument.\n" .
+ "See also: utils/filter_scp.pl .\n";
+}
+
+
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+ @A = split;
+ @A>=1 || die "Invalid id-list file line $_";
+ $seen{$A[0]} = 1;
+}
+
+if ($field == 1) { # Treat this as special case, since it is common.
+ while(<>) {
+ $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
+ # $1 is what we filter on.
+ if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
+ print $_;
+ }
+ }
+} else {
+ while(<>) {
+ @A = split;
+ @A > 0 || die "Invalid scp file line $_";
+ @A >= $field || die "Invalid scp file line $_";
+ if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
+ print $_;
+ }
+ }
+}
+
+# tests:
+# the following should print "foo 1"
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
+# the following should print "bar 2".
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py b/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py
new file mode 100755
index 0000000000000000000000000000000000000000..2f84f015542bb38da027b8ea61e8638f873cec33
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/make_raw_list.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ parser.add_argument('--segments', default=None, help='segments file')
+ parser.add_argument('wav_file', help='wav file')
+ parser.add_argument('text_file', help='text file')
+ parser.add_argument('output_file', help='output list file')
+ args = parser.parse_args()
+
+ wav_table = {}
+ with open(args.wav_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ wav_table[arr[0]] = arr[1]
+
+ if args.segments is not None:
+ segments_table = {}
+ with open(args.segments, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 4
+ segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
+
+ with open(args.text_file, 'r', encoding='utf8') as fin, \
+ open(args.output_file, 'w', encoding='utf8') as fout:
+ for line in fin:
+ arr = line.strip().split(maxsplit=1)
+ key = arr[0]
+ txt = arr[1] if len(arr) > 1 else ''
+ if args.segments is None:
+ assert key in wav_table
+ wav = wav_table[key]
+ line = dict(key=key, wav=wav, txt=txt)
+ else:
+ assert key in segments_table
+ wav_key, start, end = segments_table[key]
+ wav = wav_table[wav_key]
+ line = dict(key=key, wav=wav, txt=txt, start=start, end=end)
+ json_line = json.dumps(line, ensure_ascii=False)
+ fout.write(json_line + '\n')
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py b/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py
new file mode 100755
index 0000000000000000000000000000000000000000..fcd4bcd7d62ba933cf27c34fc02e18371a6b10a6
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/make_shard_list.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import io
+import logging
+import os
+import tarfile
+import time
+import multiprocessing
+
+import torch
+import torchaudio
+import torchaudio.backend.sox_io_backend as sox
+
+AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
+
+
+def write_tar_file(data_list,
+ no_segments,
+ tar_file,
+ resample=16000,
+ index=0,
+ total=1):
+ logging.info('Processing {} {}/{}'.format(tar_file, index, total))
+ read_time = 0.0
+ save_time = 0.0
+ write_time = 0.0
+ with tarfile.open(tar_file, "w") as tar:
+ prev_wav = None
+ for item in data_list:
+ if no_segments:
+ key, txt, wav = item
+ else:
+ key, txt, wav, start, end = item
+
+ suffix = wav.split('.')[-1]
+ assert suffix in AUDIO_FORMAT_SETS
+ if no_segments:
+ ts = time.time()
+ with open(wav, 'rb') as fin:
+ data = fin.read()
+ read_time += (time.time() - ts)
+ else:
+ if wav != prev_wav:
+ ts = time.time()
+ waveforms, sample_rate = sox.load(wav, normalize=False)
+ read_time += (time.time() - ts)
+ prev_wav = wav
+ start = int(start * sample_rate)
+ end = int(end * sample_rate)
+ audio = waveforms[:1, start:end]
+
+ # resample
+ if sample_rate != resample:
+ audio = torchaudio.transforms.Resample(
+ sample_rate, resample)(audio)
+
+ ts = time.time()
+ f = io.BytesIO()
+ sox.save(f, audio, resample, format="wav", bits_per_sample=16)
+ # Save to wav for segments file
+ suffix = "wav"
+ f.seek(0)
+ data = f.read()
+ save_time += (time.time() - ts)
+
+ assert isinstance(txt, str)
+ ts = time.time()
+ txt_file = key + '.txt'
+ txt = txt.encode('utf8')
+ txt_data = io.BytesIO(txt)
+ txt_info = tarfile.TarInfo(txt_file)
+ txt_info.size = len(txt)
+ tar.addfile(txt_info, txt_data)
+
+ wav_file = key + '.' + suffix
+ wav_data = io.BytesIO(data)
+ wav_info = tarfile.TarInfo(wav_file)
+ wav_info.size = len(data)
+ tar.addfile(wav_info, wav_data)
+ write_time += (time.time() - ts)
+ logging.info('read {} save {} write {}'.format(read_time, save_time,
+ write_time))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='')
+ parser.add_argument('--num_utts_per_shard',
+ type=int,
+ default=1000,
+ help='num utts per shard')
+ parser.add_argument('--num_threads',
+ type=int,
+ default=1,
+ help='num threads for make shards')
+ parser.add_argument('--prefix',
+ default='shards',
+ help='prefix of shards tar file')
+ parser.add_argument('--segments', default=None, help='segments file')
+ parser.add_argument('--resample',
+ type=int,
+ default=16000,
+ help='target sample rate for resampling')
+ parser.add_argument('wav_file', help='wav file')
+ parser.add_argument('text_file', help='text file')
+ parser.add_argument('shards_dir', help='output shards dir')
+ parser.add_argument('shards_list', help='output shards list file')
+ args = parser.parse_args()
+ logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s %(levelname)s %(message)s')
+
+ torch.set_num_threads(1)
+ wav_table = {}
+ with open(args.wav_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ wav_table[arr[0]] = arr[1]
+
+ no_segments = True
+ segments_table = {}
+ if args.segments is not None:
+ no_segments = False
+ with open(args.segments, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 4
+ segments_table[arr[0]] = (arr[1], float(arr[2]), float(arr[3]))
+
+ data = []
+ with open(args.text_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split(maxsplit=1)
+ key = arr[0]
+ txt = arr[1] if len(arr) > 1 else ''
+ if no_segments:
+ assert key in wav_table
+ wav = wav_table[key]
+ data.append((key, txt, wav))
+ else:
+ wav_key, start, end = segments_table[key]
+ wav = wav_table[wav_key]
+ data.append((key, txt, wav, start, end))
+
+ num = args.num_utts_per_shard
+ chunks = [data[i:i + num] for i in range(0, len(data), num)]
+ os.makedirs(args.shards_dir, exist_ok=True)
+
+ # Use a process pool to speed up shard writing
+ pool = multiprocessing.Pool(processes=args.num_threads)
+ shards_list = []
+ tasks_list = []
+ num_chunks = len(chunks)
+ for i, chunk in enumerate(chunks):
+ tar_file = os.path.join(args.shards_dir,
+ '{}_{:09d}.tar'.format(args.prefix, i))
+ shards_list.append(tar_file)
+ pool.apply_async(
+ write_tar_file,
+ (chunk, no_segments, tar_file, args.resample, i, num_chunks))
+
+ pool.close()
+ pool.join()
+
+ with open(args.shards_list, 'w', encoding='utf8') as fout:
+ for name in shards_list:
+ fout.write(name + '\n')
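+
+# Illustrative invocation (paths are assumptions; the positional arguments are
+# the Kaldi-style wav.scp, the text file, the output shard directory and the
+# output shard list):
+#   python3 tools/make_shard_list.py --num_utts_per_shard 1000 --num_threads 8 \
+#       data/test/wav.scp data/test/text data/test/shards data/test/shards.list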
diff --git a/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py b/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py
new file mode 100755
index 0000000000000000000000000000000000000000..4f4dcc901d436650695f0b80e0cf99e1e99269ee
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/tools/text2token.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+# Copyright 2021 JD AI Lab. All Rights Reserved. (authors: Lu Fan)
+# Copyright 2021 Mobvoi Inc. All Rights Reserved. (Di Wu)
+# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import codecs
+import re
+import sys
+
+is_python2 = sys.version_info[0] == 2
+
+
+def exist_or_not(i, match_pos):
+ start_pos = None
+ end_pos = None
+ for pos in match_pos:
+ if pos[0] <= i < pos[1]:
+ start_pos = pos[0]
+ end_pos = pos[1]
+ break
+
+ return start_pos, end_pos
+
+def seg_char(sent):
+ pattern = re.compile(r'([\u4e00-\u9fa5])')
+ chars = pattern.split(sent)
+ chars = [w for w in chars if len(w.strip()) > 0]
+ return chars
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='convert raw text to tokenized text',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--nchar',
+ '-n',
+ default=1,
+ type=int,
+ help='number of characters to split, i.e., \
+ aabb -> a a b b with -n 1 and aa bb with -n 2')
+ parser.add_argument('--skip-ncols',
+ '-s',
+ default=0,
+ type=int,
+ help='skip first n columns')
+ parser.add_argument('--space',
+ default='',
+ type=str,
+ help='space symbol')
+ parser.add_argument('--bpe-model',
+ '-m',
+ default=None,
+ type=str,
+ help='bpe model for english part')
+ parser.add_argument('--non-lang-syms',
+ '-l',
+ default=None,
+ type=str,
+ help='list of non-linguistic symbols,'
+ ' e.g., <NOISE> etc.')
+ parser.add_argument('text',
+ type=str,
+ default=False,
+ nargs='?',
+ help='input text')
+ parser.add_argument('--trans_type',
+ '-t',
+ type=str,
+ default="char",
+ choices=["char", "phn", "cn_char_en_bpe"],
+ help="""Transcript type. char/phn. e.g., for TIMIT
+ FADG0_SI1279 -
+ If trans_type is char, read from
+ SI1279.WRD file -> "bricks are an alternative"
+ Else if trans_type is phn,
+ read from SI1279.PHN file ->
+ "sil b r ih sil k s aa r er n aa l
+ sil t er n ih sil t ih v sil" """)
+ return parser
+
+
+def main():
+ parser = get_parser()
+ args = parser.parse_args()
+
+ rs = []
+ if args.non_lang_syms is not None:
+ with codecs.open(args.non_lang_syms, 'r', encoding="utf-8") as f:
+ nls = [x.rstrip() for x in f.readlines()]
+ rs = [re.compile(re.escape(x)) for x in nls]
+
+ if args.bpe_model is not None:
+ import sentencepiece as spm
+ sp = spm.SentencePieceProcessor()
+ sp.load(args.bpe_model)
+
+ if args.text:
+ f = codecs.open(args.text, encoding="utf-8")
+ else:
+ f = codecs.getreader("utf-8")(
+ sys.stdin if is_python2 else sys.stdin.buffer)
+
+ sys.stdout = codecs.getwriter("utf-8")(
+ sys.stdout if is_python2 else sys.stdout.buffer)
+ line = f.readline()
+ n = args.nchar
+ while line:
+ x = line.split()
+ print(' '.join(x[:args.skip_ncols]), end=" ")
+ a = ' '.join(x[args.skip_ncols:])
+
+ # get all matched positions
+ match_pos = []
+ for r in rs:
+ i = 0
+ while i >= 0:
+ m = r.search(a, i)
+ if m:
+ match_pos.append([m.start(), m.end()])
+ i = m.end()
+ else:
+ break
+
+ if len(match_pos) > 0:
+ chars = []
+ i = 0
+ while i < len(a):
+ start_pos, end_pos = exist_or_not(i, match_pos)
+ if start_pos is not None:
+ chars.append(a[start_pos:end_pos])
+ i = end_pos
+ else:
+ chars.append(a[i])
+ i += 1
+ a = chars
+
+ if (args.trans_type == "phn"):
+ a = a.split(" ")
+ elif args.trans_type == "cn_char_en_bpe":
+ b = seg_char(a)
+ a = []
+ for j in b:
+ # we use "▁" to instead of blanks among english words
+ # warning: here is "▁", not "_"
+ for l in j.strip().split("▁"):
+ if not l.encode('UTF-8').isalpha():
+ a.append(l)
+ else:
+ for k in sp.encode_as_pieces(l):
+ a.append(k)
+ else:
+ a = [a[j:j + n] for j in range(0, len(a), n)]
+
+ a_flat = []
+ for z in a:
+ a_flat.append("".join(z))
+
+ a_chars = [z.replace(' ', args.space) for z in a_flat]
+ if (args.trans_type == "phn"):
+ a_chars = [z.replace("sil", args.space) for z in a_chars]
+ print(' '.join(a_chars))
+ line = f.readline()
+
+
+if __name__ == '__main__':
+ main()
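+
+# Illustrative invocation (an assumption, following common WeNet recipes):
+# skip the first column (the utterance id) and split the rest into single
+# characters, writing the tokenized text to stdout.
+#   python3 tools/text2token.py -s 1 -n 1 data/test/text > data/test/text.char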
diff --git a/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py b/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57435c110fc12f39d79c1b02f4b2e83dfe1a3e3
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/utils/__init__.py
@@ -0,0 +1,39 @@
+import os
+import torch
+import numpy as np
+
+from .embedding import RelPositionalEncoding
+
+
+rel_positional_encoding = RelPositionalEncoding(256, 0.1)
+
+
+def make_pad_mask(lengths: np.ndarray, max_len: int = 0) -> np.ndarray:
+ """Make the subsampled encoder attention mask from utterance lengths.
+
+ A pad mask (True at padded positions) is built from ``lengths``, inverted so
+ that valid frames are marked, expanded to shape (B, 1, T), and subsampled
+ twice by a factor of 2 along the time axis to match the encoder's
+ convolutional subsampling by 4.
+
+ Args:
+ lengths (numpy.ndarray): Batch of lengths (B,).
+ max_len (int): Padded length; defaults to lengths.max().
+ Returns:
+ numpy.ndarray: int32 mask of shape (B, 1, T'), 1 for valid frames and
+ 0 for padding, where T' is the subsampled time length.
+ """
+
+ batch_size = lengths.shape[0]
+ max_len = max_len if max_len > 0 else lengths.max().item()
+ seq_range = np.arange(0, max_len, dtype=np.int64)
+ seq_range_expand = np.tile(seq_range, batch_size).reshape(batch_size, max_len)
+ seq_length_expand = lengths[..., None]
+ mask = seq_range_expand >= seq_length_expand
+ mask = np.expand_dims(mask, axis=1)
+ mask = ~mask
+ mask = mask[:, :, 2::2][:, :, 2::2]
+ mask = mask.astype(np.int32)
+ return mask
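+
+# Illustrative shapes (an example, not part of the original utilities): for two
+# utterances of 100 and 80 frames the returned mask has shape (2, 1, 24),
+# since the time axis is subsampled twice by a factor of 2; entries are 1 for
+# valid positions and 0 for padding.
+#   lengths = np.array([100, 80], dtype=np.int64)
+#   mask = make_pad_mask(lengths)   # -> shape (2, 1, 24), dtype int32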
diff --git a/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py b/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fd65c4cdfc3fec244c88d2c47cf94b33b9088f3
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/utils/embedding.py
@@ -0,0 +1,133 @@
+"""Positonal Encoding Module."""
+
+import math
+from typing import Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+
+class PositionalEncoding(torch.nn.Module):
+ """Positional encoding.
+
+ :param int d_model: embedding dim
+ :param float dropout_rate: dropout rate
+ :param int max_len: maximum input length
+
+ PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
+ PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+ """
+
+ def __init__(self,
+ d_model: int,
+ dropout_rate: float,
+ max_len: int = 5000,
+ reverse: bool = False):
+ """Construct an PositionalEncoding object."""
+ super().__init__()
+ self.d_model = d_model
+ self.xscale = math.sqrt(self.d_model)
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
+ self.max_len = max_len
+
+ pe = torch.zeros(self.max_len, self.d_model)
+ position = torch.arange(0, self.max_len,
+ dtype=torch.float32).unsqueeze(1)
+ div_term = torch.exp(
+ torch.arange(0, self.d_model, 2, dtype=torch.float32) *
+ -(math.log(10000.0) / self.d_model))
+ pe[:, 0::2] = torch.sin(position * div_term)
+ pe[:, 1::2] = torch.cos(position * div_term)
+ pe = pe.unsqueeze(0)
+ self.register_buffer("pe", pe)
+
+ def forward(self,
+ x: torch.Tensor,
+ offset: Union[int, torch.Tensor] = 0) \
+ -> Tuple[torch.Tensor, torch.Tensor]:
+ """Add positional encoding.
+
+ Args:
+ x (torch.Tensor): Input. Its shape is (batch, time, ...)
+ offset (int, torch.tensor): position offset
+
+ Returns:
+ torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+ torch.Tensor: for compatibility to RelPositionalEncoding
+ """
+
+ pos_emb = self.position_encoding(offset, x.size(1), False)
+ x = x * self.xscale + pos_emb
+ return self.dropout(x), self.dropout(pos_emb)
+
+ def position_encoding(self,
+ offset: Union[int, torch.Tensor],
+ size: int,
+ apply_dropout: bool = True) -> torch.Tensor:
+ """ For getting encoding in a streaming fashion
+
+ Attention!!!!!
+ In a non-streaming setting dropout is applied only once at the whole
+ utterance level, but in a streaming scenario this function is called
+ several times with increasing input size, so dropout would otherwise
+ be applied several times; hence the `apply_dropout` flag.
+
+ Args:
+ offset (int or torch.tensor): start offset
+ size (int): required size of position encoding
+
+ Returns:
+ torch.Tensor: Corresponding encoding
+ """
+ # How to subscript a Union type:
+ # https://github.com/pytorch/pytorch/issues/69434
+ # import ipdb;ipdb.set_trace()
+ if isinstance(offset, int):
+ assert offset + size <= self.max_len
+ pos_emb = self.pe[:, offset:offset + size]
+ elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar
+ assert offset + size <= self.max_len
+ pos_emb = self.pe[:, offset:offset + size]
+ else: # for batched streaming decoding on GPU
+ assert torch.max(offset) + size <= self.max_len
+ index = offset.unsqueeze(1) + \
+ torch.arange(0, size).to(offset.device) # B X T
+ flag = index > 0
+ # remove negative offset
+ index = index * flag
+ pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model
+
+ if apply_dropout:
+ pos_emb = self.dropout(pos_emb)
+ return pos_emb
+
+
+class RelPositionalEncoding(PositionalEncoding):
+ """Relative positional encoding module.
+ See : Appendix B in https://arxiv.org/abs/1901.02860
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int): Maximum input length.
+ """
+
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+ """Initialize class."""
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+ def forward(self,
+ seq_len: int,
+ offset: Union[int, torch.Tensor] = 0) \
+ -> torch.Tensor:
+ """Compute the relative positional encoding.
+ Args:
+ seq_len (int): Required length of the positional embedding.
+ offset (int or torch.Tensor): Start offset, defaults to 0.
+ Returns:
+ torch.Tensor: Positional embedding tensor (1, seq_len, d_model).
+ """
+ pos_emb = self.position_encoding(offset, seq_len, False)
+ # return self.dropout(pos_emb)
+ return pos_emb
+
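+# Illustrative usage (values are assumptions; utils/__init__.py constructs the
+# module the same way with d_model=256):
+#   pos_enc = RelPositionalEncoding(256, dropout_rate=0.1)
+#   pos_emb = pos_enc(seq_len=100)   # -> shape (1, 100, 256), dropout not applied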
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/__init__.py b/models/speech/speech_recognition/conformer/ixrt/wenet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py b/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..88a8cd15aec2277a36358883b25e929b179165e8
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/dataset.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import torch
+import torch.distributed as dist
+from torch.utils.data import IterableDataset
+
+import wenet.processor as processor
+from wenet.file_utils import read_lists
+
+
+class Processor(IterableDataset):
+ def __init__(self, source, f, *args, **kw):
+ assert callable(f)
+ self.source = source
+ self.f = f
+ self.args = args
+ self.kw = kw
+
+ def set_epoch(self, epoch):
+ self.source.set_epoch(epoch)
+
+ def __iter__(self):
+ """ Return an iterator over the source dataset processed by the
+ given processor.
+ """
+ assert self.source is not None
+ assert callable(self.f)
+ return self.f(iter(self.source), *self.args, **self.kw)
+
+ def apply(self, f):
+ assert callable(f)
+ return Processor(self, f, *self.args, **self.kw)
+
+
+class DistributedSampler:
+ def __init__(self, shuffle=True, partition=True):
+ self.epoch = -1
+ self.update()
+ self.shuffle = shuffle
+ self.partition = partition
+
+ def update(self):
+ assert dist.is_available()
+ if dist.is_initialized():
+ self.rank = dist.get_rank()
+ self.world_size = dist.get_world_size()
+ else:
+ self.rank = 0
+ self.world_size = 1
+ worker_info = torch.utils.data.get_worker_info()
+ if worker_info is None:
+ self.worker_id = 0
+ self.num_workers = 1
+ else:
+ self.worker_id = worker_info.id
+ self.num_workers = worker_info.num_workers
+ return dict(rank=self.rank,
+ world_size=self.world_size,
+ worker_id=self.worker_id,
+ num_workers=self.num_workers)
+
+ def set_epoch(self, epoch):
+ self.epoch = epoch
+
+ def sample(self, data):
+ """ Sample data according to rank/world_size/num_workers
+
+ Args:
+ data(List): input data list
+
+ Returns:
+ List: data list after sample
+ """
+ data = list(range(len(data)))
+ # TODO(Binbin Zhang): fix this
+ # We can not handle uneven data for CV on DDP, so we don't
+ # sample data by rank, that means every GPU gets the same
+ # and all the CV data
+ if self.partition:
+ if self.shuffle:
+ random.Random(self.epoch).shuffle(data)
+ data = data[self.rank::self.world_size]
+ data = data[self.worker_id::self.num_workers]
+ return data
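+
+ # Illustrative partition (assumed values, shuffle disabled): with
+ # world_size=2, rank=0, num_workers=2, worker_id=1 and 8 items, sample()
+ # reduces the indices [0..7] -> [0, 2, 4, 6] (rank slice) -> [2, 6]
+ # (worker slice).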
+
+
+class DataList(IterableDataset):
+ def __init__(self, lists, shuffle=True, partition=True):
+ self.lists = lists
+ self.sampler = DistributedSampler(shuffle, partition)
+
+ def set_epoch(self, epoch):
+ self.sampler.set_epoch(epoch)
+
+ def __iter__(self):
+ sampler_info = self.sampler.update()
+ indexes = self.sampler.sample(self.lists)
+ for index in indexes:
+ # yield dict(src=src)
+ data = dict(src=self.lists[index])
+ data.update(sampler_info)
+ yield data
+
+
+def Dataset(data_type,
+ data_list_file,
+ symbol_table,
+ conf,
+ bpe_model=None,
+ non_lang_syms=None,
+ partition=True):
+ """ Construct dataset from arguments
+
+ We have two shuffle stages in the Dataset. The first is a global
+ shuffle at the shards tar/raw file level. The second is a local,
+ buffered shuffle at the training-sample level.
+
+ Args:
+ data_type(str): raw/shard
+ bpe_model(str): model for english bpe part
+ partition(bool): whether to do data partition in terms of rank
+ """
+ assert data_type in ['raw', 'shard']
+ lists = read_lists(data_list_file)
+ shuffle = conf.get('shuffle', True)
+ dataset = DataList(lists, shuffle=shuffle, partition=partition)
+ if data_type == 'shard':
+ dataset = Processor(dataset, processor.url_opener)
+ dataset = Processor(dataset, processor.tar_file_and_group)
+ else:
+ dataset = Processor(dataset, processor.parse_raw)
+
+ dataset = Processor(dataset, processor.tokenize, symbol_table, bpe_model,
+ non_lang_syms, conf.get('split_with_space', False))
+ filter_conf = conf.get('filter_conf', {})
+ dataset = Processor(dataset, processor.filter, **filter_conf)
+
+ resample_conf = conf.get('resample_conf', {})
+ dataset = Processor(dataset, processor.resample, **resample_conf)
+
+ speed_perturb = conf.get('speed_perturb', False)
+ if speed_perturb:
+ dataset = Processor(dataset, processor.speed_perturb)
+
+ fbank_conf = conf.get('fbank_conf', {})
+ dataset = Processor(dataset, processor.compute_fbank, **fbank_conf)
+
+ spec_aug = conf.get('spec_aug', True)
+ if spec_aug:
+ spec_aug_conf = conf.get('spec_aug_conf', {})
+ dataset = Processor(dataset, processor.spec_aug, **spec_aug_conf)
+
+ if shuffle:
+ shuffle_conf = conf.get('shuffle_conf', {})
+ dataset = Processor(dataset, processor.shuffle, **shuffle_conf)
+
+ sort = conf.get('sort', True)
+ if sort:
+ sort_conf = conf.get('sort_conf', {})
+ dataset = Processor(dataset, processor.sort, **sort_conf)
+
+ batch_conf = conf.get('batch_conf', {})
+ dataset = Processor(dataset, processor.batch, **batch_conf)
+ dataset = Processor(dataset, processor.padding)
+ return dataset
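+
+# Illustrative usage (config values are assumptions, not taken from the
+# original recipe): build a non-shuffled test pipeline over a shard list and
+# iterate padded batches.
+#   from wenet.file_utils import read_symbol_table
+#   symbol_table = read_symbol_table('data/dict/lang_char.txt')
+#   conf = dict(shuffle=False, sort=False, spec_aug=False,
+#               fbank_conf={'num_mel_bins': 80},
+#               batch_conf={'batch_type': 'static', 'batch_size': 16})
+#   test_set = Dataset('shard', 'data/test/data.list', symbol_table, conf)
+#   for keys, feats, labels, feats_lengths, label_lengths in test_set:
+#       pass  # feed feats / feats_lengths to the conformer engine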
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py b/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b7e516cc61f759267f4ef09309ff0b45110a0c1
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/file_utils.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+
+def read_lists(list_file):
+ lists = []
+ with open(list_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ lists.append(line.strip())
+ return lists
+
+
+def read_non_lang_symbols(non_lang_sym_path):
+ """read non-linguistic symbol from file.
+
+ The file format is like below:
+
+ {NOISE}\n
+ {BRK}\n
+ ...
+
+
+ Args:
+ non_lang_sym_path: non-linguistic symbol file path; None means there
+ are no such symbols.
+
+ """
+ if non_lang_sym_path is None:
+ return None
+ else:
+ syms = read_lists(non_lang_sym_path)
+ non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
+ for sym in syms:
+ if non_lang_syms_pattern.fullmatch(sym) is None:
+ class BadSymbolFormat(Exception):
+ pass
+ raise BadSymbolFormat(
+ "Non-linguistic symbols should be "
+ "formatted as {xxx}, <xxx> or [xxx]; consider"
+ " modifying '%s' to meet the requirement. "
+ "More details can be found in the discussion here: "
+ "https://github.com/wenet-e2e/wenet/pull/819" % (sym))
+ return syms
+
+
+def read_symbol_table(symbol_table_file):
+ symbol_table = {}
+ with open(symbol_table_file, 'r', encoding='utf8') as fin:
+ for line in fin:
+ arr = line.strip().split()
+ assert len(arr) == 2
+ symbol_table[arr[0]] = int(arr[1])
+ return symbol_table
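+
+# Illustrative file formats (assumptions based on how the readers above parse
+# their inputs):
+#   symbol table, one "symbol id" pair per line:
+#       <blank> 0
+#       <unk> 1
+#       ...
+#   non-linguistic symbol list, one symbol per line in {xxx}/<xxx>/[xxx] form:
+#       {NOISE}
+#       {BRK}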
diff --git a/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py b/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a542a3d204cdb3def8cf61ce0b0fd8bb31af32e
--- /dev/null
+++ b/models/speech/speech_recognition/conformer/ixrt/wenet/processor.py
@@ -0,0 +1,550 @@
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import json
+import random
+import re
+import tarfile
+from subprocess import PIPE, Popen
+from urllib.parse import urlparse
+
+import torch
+import torchaudio
+import torchaudio.compliance.kaldi as kaldi
+from torch.nn.utils.rnn import pad_sequence
+
+AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
+
+
+def url_opener(data):
+ """ Give url or local file, return file descriptor
+ Inplace operation.
+
+ Args:
+ data(Iterable[str]): url or local file list
+
+ Returns:
+ Iterable[{src, stream}]
+ """
+ for sample in data:
+ assert 'src' in sample
+ # TODO(Binbin Zhang): support HTTP
+ url = sample['src']
+ try:
+ pr = urlparse(url)
+ # local file
+ if pr.scheme == '' or pr.scheme == 'file':
+ stream = open(url, 'rb')
+ # network file, such as HTTP(HDFS/OSS/S3)/HTTPS/SCP
+ else:
+ cmd = f'curl -s -L {url}'
+ process = Popen(cmd, shell=True, stdout=PIPE)
+ sample.update(process=process)
+ stream = process.stdout
+ sample.update(stream=stream)
+ yield sample
+ except Exception as ex:
+ logging.warning('Failed to open {}'.format(url))
+
+
+def tar_file_and_group(data):
+ """ Expand a stream of open tar files into a stream of tar file contents.
+ And groups the file with same prefix
+
+ Args:
+ data: Iterable[{src, stream}]
+
+ Returns:
+ Iterable[{key, wav, txt, sample_rate}]
+ """
+ for sample in data:
+ assert 'stream' in sample
+ stream = tarfile.open(fileobj=sample['stream'], mode="r|*")
+ prev_prefix = None
+ example = {}
+ valid = True
+ for tarinfo in stream:
+ name = tarinfo.name
+ pos = name.rfind('.')
+ assert pos > 0
+ prefix, postfix = name[:pos], name[pos + 1:]
+ if prev_prefix is not None and prefix != prev_prefix:
+ example['key'] = prev_prefix
+ if valid:
+ yield example
+ example = {}
+ valid = True
+ with stream.extractfile(tarinfo) as file_obj:
+ try:
+ if postfix == 'txt':
+ example['txt'] = file_obj.read().decode('utf8').strip()
+ elif postfix in AUDIO_FORMAT_SETS:
+ waveform, sample_rate = torchaudio.load(file_obj)
+ example['wav'] = waveform
+ example['sample_rate'] = sample_rate
+ else:
+ example[postfix] = file_obj.read()
+ except Exception as ex:
+ valid = False
+ logging.warning('failed to parse {}'.format(name))
+ prev_prefix = prefix
+ if prev_prefix is not None:
+ example['key'] = prev_prefix
+ yield example
+ stream.close()
+ if 'process' in sample:
+ sample['process'].communicate()
+ sample['stream'].close()
+
+
+def parse_raw(data):
+ """ Parse key/wav/txt from json line
+
+ Args:
+ data: Iterable[str], where each str is a json line with key/wav/txt
+
+ Returns:
+ Iterable[{key, wav, txt, sample_rate}]
+ """
+ for sample in data:
+ assert 'src' in sample
+ json_line = sample['src']
+ obj = json.loads(json_line)
+ assert 'key' in obj
+ assert 'wav' in obj
+ assert 'txt' in obj
+ key = obj['key']
+ wav_file = obj['wav']
+ txt = obj['txt']
+ try:
+ if 'start' in obj:
+ assert 'end' in obj
+ sample_rate = torchaudio.backend.sox_io_backend.info(
+ wav_file).sample_rate
+ start_frame = int(obj['start'] * sample_rate)
+ end_frame = int(obj['end'] * sample_rate)
+ waveform, _ = torchaudio.backend.sox_io_backend.load(
+ filepath=wav_file,
+ num_frames=end_frame - start_frame,
+ frame_offset=start_frame)
+ else:
+ waveform, sample_rate = torchaudio.load(wav_file)
+ example = dict(key=key,
+ txt=txt,
+ wav=waveform,
+ sample_rate=sample_rate)
+ yield example
+ except Exception as ex:
+ logging.warning('Failed to read {}'.format(wav_file))
+
+
+def filter(data,
+ max_length=10240,
+ min_length=10,
+ token_max_length=200,
+ token_min_length=1,
+ min_output_input_ratio=0.0005,
+ max_output_input_ratio=1):
+ """ Filter sample according to feature and label length
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ max_length: drop utterances longer than max_length (in 10 ms frames)
+ min_length: drop utterances shorter than min_length (in 10 ms frames)
+ token_max_length: drop utterances with more than token_max_length
+ tokens, especially when using char units for
+ English modeling
+ token_min_length: drop utterances with fewer than token_min_length
+ tokens
+ min_output_input_ratio: minimal ratio of
+ token_length / feats_length (per 10 ms frame)
+ max_output_input_ratio: maximum ratio of
+ token_length / feats_length (per 10 ms frame)
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ assert 'label' in sample
+ # sample['wav'] is torch.Tensor, we have 100 frames every second
+ num_frames = sample['wav'].size(1) / sample['sample_rate'] * 100
+ if num_frames < min_length:
+ continue
+ if num_frames > max_length:
+ continue
+ if len(sample['label']) < token_min_length:
+ continue
+ if len(sample['label']) > token_max_length:
+ continue
+ if num_frames != 0:
+ if len(sample['label']) / num_frames < min_output_input_ratio:
+ continue
+ if len(sample['label']) / num_frames > max_output_input_ratio:
+ continue
+ yield sample
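+
+# Illustrative filter_conf (assumed values; units follow the docstring, i.e.
+# frame counts at a 10 ms frame shift):
+#   filter_conf = dict(max_length=10240,   # ~102 s of 10 ms frames
+#                      min_length=10,      # 0.1 s
+#                      token_max_length=200,
+#                      token_min_length=1)
+#   dataset = Processor(dataset, filter, **filter_conf)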
+
+
+def resample(data, resample_rate=16000):
+ """ Resample data.
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ resample_rate: target resample rate
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ if sample_rate != resample_rate:
+ sample['sample_rate'] = resample_rate
+ sample['wav'] = torchaudio.transforms.Resample(
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+ yield sample
+
+
+def speed_perturb(data, speeds=None):
+ """ Apply speed perturb to the data.
+ Inplace operation.
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+ speeds(List[float]): optional speed
+
+ Returns:
+ Iterable[{key, wav, label, sample_rate}]
+ """
+ if speeds is None:
+ speeds = [0.9, 1.0, 1.1]
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ speed = random.choice(speeds)
+ if speed != 1.0:
+ wav, _ = torchaudio.sox_effects.apply_effects_tensor(
+ waveform, sample_rate,
+ [['speed', str(speed)], ['rate', str(sample_rate)]])
+ sample['wav'] = wav
+
+ yield sample
+
+
+def compute_fbank(data,
+ num_mel_bins=23,
+ frame_length=25,
+ frame_shift=10,
+ dither=0.0):
+ """ Extract fbank
+
+ Args:
+ data: Iterable[{key, wav, label, sample_rate}]
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+ for sample in data:
+ assert 'sample_rate' in sample
+ assert 'wav' in sample
+ assert 'key' in sample
+ assert 'label' in sample
+ sample_rate = sample['sample_rate']
+ waveform = sample['wav']
+ waveform = waveform * (1 << 15)
+ # Only keep key, feat, label
+ mat = kaldi.fbank(waveform,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ energy_floor=0.0,
+ sample_frequency=sample_rate)
+ yield dict(key=sample['key'], label=sample['label'], feat=mat)
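+
+# Illustrative fbank_conf for 16 kHz speech (assumed values, tune per model):
+#   fbank_conf = dict(num_mel_bins=80, frame_length=25, frame_shift=10, dither=0.0)
+#   dataset = Processor(dataset, compute_fbank, **fbank_conf)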
+
+
+def __tokenize_by_bpe_model(sp, txt):
+ tokens = []
+ # CJK(China Japan Korea) unicode range is [U+4E00, U+9FFF], ref:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ pattern = re.compile(r'([\u4e00-\u9fff])')
+ # Example:
+ # txt = "你好 ITS'S OKAY 的"
+ # chars = ["你", "好", " ITS'S OKAY ", "的"]
+ chars = pattern.split(txt.upper())
+ mix_chars = [w for w in chars if len(w.strip()) > 0]
+ for ch_or_w in mix_chars:
+ # ch_or_w is a single CJK character (e.g., "你"), do nothing.
+ if pattern.fullmatch(ch_or_w) is not None:
+ tokens.append(ch_or_w)
+ # ch_or_w contains non-CJK characters (e.g., " IT'S OKAY "),
+ # encode ch_or_w using bpe_model.
+ else:
+ for p in sp.encode_as_pieces(ch_or_w):
+ tokens.append(p)
+
+ return tokens
+
+
+def tokenize(data, symbol_table, bpe_model=None, non_lang_syms=None,
+ split_with_space=False):
+ """ Decode text to chars or BPE
+ Inplace operation
+
+ Args:
+ data: Iterable[{key, wav, txt, sample_rate}]
+
+ Returns:
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
+ """
+ if non_lang_syms is not None:
+ non_lang_syms_pattern = re.compile(r"(\[[^\[\]]+\]|<[^<>]+>|{[^{}]+})")
+ else:
+ non_lang_syms = {}
+ non_lang_syms_pattern = None
+
+ if bpe_model is not None:
+ import sentencepiece as spm
+ sp = spm.SentencePieceProcessor()
+ sp.load(bpe_model)
+ else:
+ sp = None
+
+ for sample in data:
+ assert 'txt' in sample
+ txt = sample['txt'].strip()
+ if non_lang_syms_pattern is not None:
+ parts = non_lang_syms_pattern.split(txt.upper())
+ parts = [w for w in parts if len(w.strip()) > 0]
+ else:
+ parts = [txt]
+
+ label = []
+ tokens = []
+ for part in parts:
+ if part in non_lang_syms:
+ tokens.append(part)
+ else:
+ if bpe_model is not None:
+ tokens.extend(__tokenize_by_bpe_model(sp, part))
+ else:
+ if split_with_space:
+ part = part.split(" ")
+ for ch in part:
+ if ch == ' ':
+ ch = "▁"
+ tokens.append(ch)
+
+ for ch in tokens:
+ if ch in symbol_table:
+ label.append(symbol_table[ch])
+ elif '<unk>' in symbol_table:
+ label.append(symbol_table['<unk>'])
+
+ sample['tokens'] = tokens
+ sample['label'] = label
+ yield sample
+
+
+def spec_aug(data, num_t_mask=2, num_f_mask=2, max_t=50, max_f=10, max_w=80):
+ """ Do spec augmentation
+ Inplace operation
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ num_t_mask: number of time mask to apply
+ num_f_mask: number of freq mask to apply
+ max_t: max width of time mask
+ max_f: max width of freq mask
+ max_w: max width of time warp
+
+ Returns
+ Iterable[{key, feat, label}]
+ """
+ for sample in data:
+ assert 'feat' in sample
+ x = sample['feat']
+ assert isinstance(x, torch.Tensor)
+ y = x.clone().detach()
+ max_frames = y.size(0)
+ max_freq = y.size(1)
+ # time mask
+ for i in range(num_t_mask):
+ start = random.randint(0, max_frames - 1)
+ length = random.randint(1, max_t)
+ end = min(max_frames, start + length)
+ y[start:end, :] = 0
+ # freq mask
+ for i in range(num_f_mask):
+ start = random.randint(0, max_freq - 1)
+ length = random.randint(1, max_f)
+ end = min(max_freq, start + length)
+ y[:, start:end] = 0
+ sample['feat'] = y
+ yield sample
+
+
+def shuffle(data, shuffle_size=10000):
+ """ Local shuffle the data
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ shuffle_size: buffer size for shuffle
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= shuffle_size:
+ random.shuffle(buf)
+ for x in buf:
+ yield x
+ buf = []
+ # The sample left over
+ random.shuffle(buf)
+ for x in buf:
+ yield x
+
+
+def sort(data, sort_size=500):
+ """ Sort the data by feature length.
+ Sort is used after shuffle and before batch, so we can group
+ utts with similar lengths into a batch, and `sort_size` should
+ be less than `shuffle_size`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ sort_size: buffer size for sort
+
+ Returns:
+ Iterable[{key, feat, label}]
+ """
+
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= sort_size:
+ buf.sort(key=lambda x: x['feat'].size(0))
+ for x in buf:
+ yield x
+ buf = []
+ # The sample left over
+ buf.sort(key=lambda x: x['feat'].size(0))
+ for x in buf:
+ yield x
+
+
+def static_batch(data, batch_size=16):
+ """ Static batch the data by `batch_size`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ batch_size: batch size
+
+ Returns:
+ Iterable[List[{key, feat, label}]]
+ """
+ buf = []
+ for sample in data:
+ buf.append(sample)
+ if len(buf) >= batch_size:
+ yield buf
+ buf = []
+ if len(buf) > 0:
+ yield buf
+
+
+def dynamic_batch(data, max_frames_in_batch=12000):
+ """ Dynamic batch the data until the total frames in batch
+ reach `max_frames_in_batch`
+
+ Args:
+ data: Iterable[{key, feat, label}]
+ max_frames_in_batch: max_frames in one batch
+
+ Returns:
+ Iterable[List[{key, feat, label}]]
+ """
+ buf = []
+ longest_frames = 0
+ for sample in data:
+ assert 'feat' in sample
+ assert isinstance(sample['feat'], torch.Tensor)
+ new_sample_frames = sample['feat'].size(0)
+ longest_frames = max(longest_frames, new_sample_frames)
+ frames_after_padding = longest_frames * (len(buf) + 1)
+ if frames_after_padding > max_frames_in_batch:
+ yield buf
+ buf = [sample]
+ longest_frames = new_sample_frames
+ else:
+ buf.append(sample)
+ if len(buf) > 0:
+ yield buf
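+
+# Illustrative dynamic batching (assumed values): with max_frames_in_batch=12000
+# and utterances of ~900 frames each, frames_after_padding grows as
+# 900 * (len(buf) + 1), so the 14th utterance would push it to 12600 > 12000
+# and a batch of 13 utterances is emitted.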
+
+
+def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000):
+ """ Wrapper for static/dynamic batch
+ """
+ if batch_type == 'static':
+ return static_batch(data, batch_size)
+ elif batch_type == 'dynamic':
+ return dynamic_batch(data, max_frames_in_batch)
+ else:
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
+
+
+def padding(data):
+ """ Padding the data into training data
+
+ Args:
+ data: Iterable[List[{key, feat, label}]]
+
+ Returns:
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
+ """
+ for sample in data:
+ assert isinstance(sample, list)
+ feats_length = torch.tensor([x['feat'].size(0) for x in sample],
+ dtype=torch.int32)
+ order = torch.argsort(feats_length, descending=True)
+ feats_lengths = torch.tensor(
+ [sample[i]['feat'].size(0) for i in order], dtype=torch.int32)
+ sorted_feats = [sample[i]['feat'] for i in order]
+ sorted_keys = [sample[i]['key'] for i in order]
+ sorted_labels = [
+ torch.tensor(sample[i]['label'], dtype=torch.int64) for i in order
+ ]
+ label_lengths = torch.tensor([x.size(0) for x in sorted_labels],
+ dtype=torch.int32)
+
+ padded_feats = pad_sequence(sorted_feats,
+ batch_first=True,
+ padding_value=0)
+ padding_labels = pad_sequence(sorted_labels,
+ batch_first=True,
+ padding_value=-1)
+
+ yield (sorted_keys, padded_feats, padding_labels, feats_lengths,
+ label_lengths)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/README.md b/models/speech/speech_recognition/transformer_asr/ixrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c2e1b456d5fe38efdda736439c1361a14dcedcd
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/README.md
@@ -0,0 +1,83 @@
+# Transformer ASR (Beam Search)
+
+## Description
+
+Beam search lets us exert control over the output of text generation, which is useful because we sometimes know exactly what we want in the output. For example, in a neural machine translation task, a dictionary lookup can tell us which words must be included in the final translation.
+
+## Setup
+
+### Install
+
+```bash
+pip3 install speechbrain==0.5.13
+```
+
+### Download
+
+Pretrained model:
+
+Dataset: to download the Aishell dataset.
+
+```bash
+# Make sure the checkpoint path is results/transformer/8886/save
+mkdir -p results/transformer/8886/save
+# The data layout should look like this:
+results/transformer/8886
+├── cer.txt
+├── dev.csv
+├── env.log
+├── hyperparams.yaml
+├── inference_encoder_ctc.py
+├── inference.py
+├── log.txt
+├── save
+│ ├── CKPT+2023-03-29+06-31-40+00
+│ │ ├── brain.ckpt
+│ │ ├── CKPT.yaml
+│ │ ├── counter.ckpt
+│ │ ├── model.ckpt
+│ │ ├── noam_scheduler.ckpt
+│ │ └── normalizer.ckpt
+│ └── tokenizer.ckpt
+├── test.csv
+├── train.csv
+└── train_log.txt
+
+# Make sure the dataset path is /home/data/speechbrain/aishell
+mkdir -p /home/data/speechbrain/aishell/csv_data
+ln -s /PATH/to/data_aishell /home/data/speechbrain/aishell/
+cp results/transformer/8886/*.csv /home/data/speechbrain/aishell/csv_data
+```
+
+## Inference
+
+### Build faster kernels
+
+```bash
+bash build.sh
+```
+
+### Build engine
+
+Set max_batch_size and max_seq_len according to your workload.
+
+```bash
+python3 builder.py \
+--ckpt_path results/transformer/8886/save \
+--head_num 4 \
+--max_batch_size 64 \
+--max_seq_len 1024 \
+--engine_path transformer.engine
+```
+
+### Run engine
+
+```bash
+python3 inference.py hparams/train_ASR_transformer.yaml --data_folder=/home/data/speechbrain/aishell --engine_path transformer.engine
+```
+
+## Results
+
+| Model | BatchSize | Precision | QPS | CER |
+| --------------- | --------- | --------- | ----- | ---- |
+| Transformer ASR | 32 | FP16 | 15.64 | 5.95 |
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py b/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba31939402691ec29525480cf0070e3016654b8d
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/aishell_prepare.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import shutil
+import logging
+from speechbrain.dataio.dataio import read_audio
+from speechbrain.utils.data_utils import download_file
+import glob
+import csv
+import argparse
+
+logger = logging.getLogger(__name__)
+
+
+def prepare_aishell(data_folder, save_folder, skip_prep=False):
+ """
+ This function prepares the AISHELL-1 dataset.
+ If the wav folder has not been extracted yet, the per-speaker archives under data_aishell/wav are unpacked first.
+
+ data_folder : path to AISHELL-1 dataset.
+ save_folder: path where to store the manifest csv files.
+ skip_prep: If True, skip data preparation.
+
+ """
+ if skip_prep:
+ return
+
+ # If the data folders do not exist, we need to extract the data
+ if not os.path.isdir(os.path.join(data_folder, "data_aishell/wav")):
+ # # Check for zip file and download if it doesn't exist
+ # zip_location = os.path.join(data_folder, "data_aishell.tgz")
+ # if not os.path.exists(zip_location):
+ # url = "https://www.openslr.org/resources/33/data_aishell.tgz"
+ # download_file(url, zip_location, unpack=True)
+ # logger.info("Extracting data_aishell.tgz...")
+ # shutil.unpack_archive(zip_location, data_folder)
+
+ wav_dir = os.path.join(data_folder, "data_aishell/wav")
+ tgz_list = glob.glob(wav_dir + "/*.tar.gz")
+ for tgz in tgz_list:
+ shutil.unpack_archive(tgz, wav_dir)
+ os.remove(tgz)
+
+ # Create filename-to-transcript dictionary
+ filename2transcript = {}
+ with open(
+ os.path.join(
+ data_folder, "data_aishell/transcript/aishell_transcript_v0.8.txt"
+ ),
+ "r",
+ ) as f:
+ lines = f.readlines()
+ for line in lines:
+ key = line.split()[0]
+ value = " ".join(line.split()[1:])
+ filename2transcript[key] = value
+
+ splits = [
+ # "train",
+ "dev",
+ "test",
+ ]
+ ID_start = 0 # needed to have a unique ID for each audio
+ for split in splits:
+ new_filename = os.path.join(save_folder, split) + ".csv"
+ if os.path.exists(new_filename):
+ continue
+ logger.info("Preparing %s..." % new_filename)
+
+ csv_output = [["ID", "duration", "wav", "transcript"]]
+ entry = []
+
+ all_wavs = glob.glob(
+ os.path.join(data_folder, "data_aishell/wav") + "/" + split + "/*/*.wav"
+ )
+ for i in range(len(all_wavs)):
+ filename = all_wavs[i].split("/")[-1].split(".wav")[0]
+ if filename not in filename2transcript:
+ continue
+ signal = read_audio(all_wavs[i])
+ duration = signal.shape[0] / 16000
+ transcript_ = filename2transcript[filename]
+ csv_line = [
+ ID_start + i,
+ str(duration),
+ all_wavs[i],
+ transcript_,
+ ]
+ entry.append(csv_line)
+
+ csv_output = csv_output + entry
+
+ with open(new_filename, mode="w") as csv_f:
+ csv_writer = csv.writer(
+ csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
+ )
+ for line in csv_output:
+ csv_writer.writerow(line)
+
+ msg = "\t%s successfully created!" % (new_filename)
+ logger.info(msg)
+
+ ID_start += len(all_wavs)
+
+
+def parse_config():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--data_folder",
+ type=str,
+ default="/home/data/speechbrain/aishell",
+ help="data folder",
+ )
+ parser.add_argument(
+ "--save_folder",
+ type=str,
+ default="/home/data/speechbrain/aishell/csv_data",
+ help="csv save folder",
+ )
+
+ config = parser.parse_args()
+ print("Config:", config)
+ return config
+
+
+if __name__ == "__main__":
+
+ config = parse_config()
+ prepare_aishell(config.data_folder, config.save_folder, skip_prep=False)
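+
+# The generated dev.csv / test.csv files follow the header written above
+# (ID, duration, wav, transcript); an illustrative row (placeholder values,
+# not real data) looks like:
+#   42,5.21,/home/data/speechbrain/aishell/data_aishell/wav/test/SXXXX/BAC009SXXXXWXXXX.wav,<chinese transcript>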
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py b/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..61e5c794ad9ff8c0d517c666c08295f36551e463
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/beam_search.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+from ctc import CTCPrefixScorer
+import time
+
+def forward(self, enc_states, wav_len): # noqa: C901
+ """Applies beamsearch and returns the predicted tokens."""
+ enc_lens = torch.round(enc_states.shape[1] * wav_len).int()
+ device = enc_states.device
+ batch_size = enc_states.shape[0]
+
+ memory = self.reset_mem(batch_size * self.beam_size, device=device)
+
+ if self.lm_weight > 0:
+ lm_memory = self.reset_lm_mem(batch_size * self.beam_size, device)
+
+ if self.ctc_weight > 0:
+ # (batch_size * beam_size, L, vocab_size)
+ ctc_outputs = self.ctc_forward_step(enc_states)
+ ctc_scorer = CTCPrefixScorer(
+ ctc_outputs,
+ enc_lens,
+ batch_size,
+ self.beam_size,
+ self.blank_index,
+ self.eos_index,
+ self.ctc_window_size,
+ )
+ ctc_memory = None
+
+ # Inflate the enc_states and enc_len by beam_size times
+ enc_states = inflate_tensor(enc_states, times=self.beam_size, dim=0)
+ enc_lens = inflate_tensor(enc_lens, times=self.beam_size, dim=0)
+
+ # Using bos as the first input
+ inp_tokens = (
+ torch.zeros(batch_size * self.beam_size, device=device)
+ .fill_(self.bos_index)
+ .long()
+ )
+
+ # The first index of each sentence.
+ self.beam_offset = (
+ torch.arange(batch_size, device=device) * self.beam_size
+ )
+
+ # initialize sequence scores variables.
+ sequence_scores = torch.empty(
+ batch_size * self.beam_size, device=device
+ )
+ sequence_scores.fill_(float("-inf"))
+
+ # keep only the first to make sure no redundancy.
+ sequence_scores.index_fill_(0, self.beam_offset, 0.0)
+
+ # keep the hypotheses that reach eos, with their corresponding scores and log_probs.
+ hyps_and_scores = [[] for _ in range(batch_size)]
+
+ # keep the sequences that have not yet reached eos.
+ alived_seq = torch.empty(
+ batch_size * self.beam_size, 0, device=device
+ ).long()
+
+ # Keep the log-probabilities of alived sequences.
+ alived_log_probs = torch.empty(
+ batch_size * self.beam_size, 0, device=device
+ )
+
+ min_decode_steps = int(enc_states.shape[1] * self.min_decode_ratio)
+ max_decode_steps = int(enc_states.shape[1] * self.max_decode_ratio)
+
+ # Initialize the previous attention peak to zero
+ # This variable will be used when using_max_attn_shift=True
+ prev_attn_peak = torch.zeros(batch_size * self.beam_size, device=device)
+
+ for t in range(max_decode_steps):
+ # terminate condition
+ if self._check_full_beams(hyps_and_scores, self.beam_size):
+ break
+
+ log_probs, memory, attn = self.forward_step(
+ inp_tokens, memory, enc_states, enc_lens
+ )
+ log_probs = self.att_weight * log_probs
+
+ # Keep the original value
+ log_probs_clone = log_probs.clone().reshape(batch_size, -1)
+ vocab_size = log_probs.shape[-1]
+
+ if self.using_max_attn_shift:
+ # Block the candidates that exceed the max shift
+ cond, attn_peak = self._check_attn_shift(attn, prev_attn_peak)
+ log_probs = mask_by_condition(
+ log_probs, cond, fill_value=self.minus_inf
+ )
+ prev_attn_peak = attn_peak
+
+ # Set eos to minus_inf when less than minimum steps.
+ if t < min_decode_steps:
+ log_probs[:, self.eos_index] = self.minus_inf
+
+ # Set the eos prob to minus_inf when it doesn't exceed threshold.
+ if self.using_eos_threshold:
+ cond = self._check_eos_threshold(log_probs)
+ log_probs[:, self.eos_index] = mask_by_condition(
+ log_probs[:, self.eos_index],
+ cond,
+ fill_value=self.minus_inf,
+ )
+
+ # adding LM scores to log_prob if lm_weight > 0
+ if self.lm_weight > 0:
+ lm_log_probs, lm_memory = self.lm_forward_step(
+ inp_tokens, lm_memory
+ )
+ log_probs = log_probs + self.lm_weight * lm_log_probs
+
+ # adding CTC scores to log_prob if ctc_weight > 0
+ if self.ctc_weight > 0:
+ g = alived_seq
+ # block blank token
+ log_probs[:, self.blank_index] = self.minus_inf
+ if self.ctc_weight != 1.0 and self.ctc_score_mode == "partial":
+ # pruning vocab for ctc_scorer
+ _, ctc_candidates = log_probs.topk(
+ self.beam_size * 2, dim=-1
+ )
+ else:
+ ctc_candidates = None
+
+ ctc_log_probs, ctc_memory = ctc_scorer.forward_step(
+ g, ctc_memory, ctc_candidates, attn
+ )
+ log_probs = log_probs + self.ctc_weight * ctc_log_probs
+
+ scores = sequence_scores.unsqueeze(1).expand(-1, vocab_size)
+ scores = scores + log_probs
+
+ # length normalization
+ if self.length_normalization:
+ scores = scores / (t + 1)
+
+ # keep topk beams
+ scores, candidates = scores.view(batch_size, -1).topk(
+ self.beam_size, dim=-1
+ )
+
+ # The input for the next step, also the output of current step.
+ inp_tokens = (candidates % vocab_size).view(
+ batch_size * self.beam_size
+ )
+
+ scores = scores.view(batch_size * self.beam_size)
+ sequence_scores = scores
+
+ # recover the length normalization
+ if self.length_normalization:
+ sequence_scores = sequence_scores * (t + 1)
+
+ # The index of which beam the current top-K output came from in (t-1) timesteps.
+ predecessors = (
+ torch.div(candidates, vocab_size, rounding_mode="floor")
+ + self.beam_offset.unsqueeze(1).expand_as(candidates)
+ ).view(batch_size * self.beam_size)
+
+ # Permute the memory to synchronize with the output.
+ memory = self.permute_mem(memory, index=predecessors)
+ if self.lm_weight > 0:
+ lm_memory = self.permute_lm_mem(lm_memory, index=predecessors)
+
+ if self.ctc_weight > 0:
+ ctc_memory = ctc_scorer.permute_mem(ctc_memory, candidates)
+
+ # If using_max_attn_shift, then the previous attn peak has to be permuted too.
+ if self.using_max_attn_shift:
+ prev_attn_peak = torch.index_select(
+ prev_attn_peak, dim=0, index=predecessors
+ )
+
+ # Add coverage penalty
+ if self.coverage_penalty > 0:
+ cur_attn = torch.index_select(attn, dim=0, index=predecessors)
+
+ # coverage: cumulative attention probability vector
+ if t == 0:
+ # Init coverage
+ self.coverage = cur_attn
+
+ # the attn of transformer is [batch_size*beam_size, current_step, source_len]
+ if len(cur_attn.size()) > 2:
+ self.coverage = torch.sum(cur_attn, dim=1)
+ else:
+ # Update coverage
+ self.coverage = torch.index_select(
+ self.coverage, dim=0, index=predecessors
+ )
+ self.coverage = self.coverage + cur_attn
+
+ # Compute coverage penalty and add it to scores
+ penalty = torch.max(
+ self.coverage, self.coverage.clone().fill_(0.5)
+ ).sum(-1)
+ penalty = penalty - self.coverage.size(-1) * 0.5
+ penalty = penalty.view(batch_size * self.beam_size)
+ penalty = (
+ penalty / (t + 1) if self.length_normalization else penalty
+ )
+ scores = scores - penalty * self.coverage_penalty
+
+ # Update alived_seq
+ alived_seq = torch.cat(
+ [
+ torch.index_select(alived_seq, dim=0, index=predecessors),
+ inp_tokens.unsqueeze(1),
+ ],
+ dim=-1,
+ )
+
+ # Takes the log-probabilities
+ beam_log_probs = log_probs_clone[
+ torch.arange(batch_size).unsqueeze(1), candidates
+ ].reshape(batch_size * self.beam_size)
+ alived_log_probs = torch.cat(
+ [
+ torch.index_select(
+ alived_log_probs, dim=0, index=predecessors
+ ),
+ beam_log_probs.unsqueeze(1),
+ ],
+ dim=-1,
+ )
+
+ is_eos = self._update_hyp_and_scores(
+ inp_tokens,
+ alived_seq,
+ alived_log_probs,
+ hyps_and_scores,
+ scores,
+ timesteps=t,
+ )
+
+ # Block the paths that have reached eos.
+ sequence_scores.masked_fill_(is_eos, float("-inf"))
+
+ if not self._check_full_beams(hyps_and_scores, self.beam_size):
+ # Using all eos to fill-up the hyps.
+ eos = (
+ torch.zeros(batch_size * self.beam_size, device=device)
+ .fill_(self.eos_index)
+ .long()
+ )
+ _ = self._update_hyp_and_scores(
+ eos,
+ alived_seq,
+ alived_log_probs,
+ hyps_and_scores,
+ scores,
+ timesteps=max_decode_steps,
+ )
+
+ (
+ topk_hyps,
+ topk_scores,
+ topk_lengths,
+ log_probs,
+ ) = self._get_top_score_prediction(hyps_and_scores, topk=self.topk,)
+ # pick the best hyp
+ predictions = topk_hyps[:, 0, :]
+ predictions = batch_filter_seq2seq_output(
+ predictions, eos_id=self.eos_index
+ )
+
+ if self.return_log_probs:
+ return predictions, topk_scores, log_probs
+ else:
+ return predictions, topk_scores
+
+
+def inflate_tensor(tensor, times, dim):
+ """This function inflates the tensor for times along dim.
+
+ Arguments
+ ---------
+ tensor : torch.Tensor
+ The tensor to be inflated.
+ times : int
+ The tensor will inflate for this number of times.
+ dim : int
+ The dim to be inflated.
+
+ Returns
+ -------
+ torch.Tensor
+ The inflated tensor.
+
+ Example
+ -------
+ >>> tensor = torch.Tensor([[1,2,3], [4,5,6]])
+ >>> new_tensor = inflate_tensor(tensor, 2, dim=0)
+ >>> new_tensor
+ tensor([[1., 2., 3.],
+ [1., 2., 3.],
+ [4., 5., 6.],
+ [4., 5., 6.]])
+ """
+ return torch.repeat_interleave(tensor, times, dim=dim)
+
+def batch_filter_seq2seq_output(prediction, eos_id=-1):
+ """Calling batch_size times of filter_seq2seq_output.
+
+ Arguments
+ ---------
+ prediction : list of torch.Tensor
+ A list containing the output ints predicted by the seq2seq system.
+ eos_id : int, string
+ The id of the eos.
+
+ Returns
+ ------
+ list
+ The output predicted by seq2seq model.
+
+ Example
+ -------
+ >>> predictions = [torch.IntTensor([1,2,3,4]), torch.IntTensor([2,3,4,5,6])]
+ >>> predictions = batch_filter_seq2seq_output(predictions, eos_id=4)
+ >>> predictions
+ [[1, 2, 3], [2, 3]]
+ """
+ outputs = []
+ for p in prediction:
+ res = filter_seq2seq_output(p.tolist(), eos_id=eos_id)
+ outputs.append(res)
+ return outputs
+
+def filter_seq2seq_output(string_pred, eos_id=-1):
+ """Filter the output until the first eos occurs (exclusive).
+
+ Arguments
+ ---------
+ string_pred : list
+ A list containing the output strings/ints predicted by the seq2seq system.
+ eos_id : int, string
+ The id of the eos.
+
+ Returns
+ ------
+ list
+ The output predicted by seq2seq model.
+
+ Example
+ -------
+ >>> string_pred = ['a','b','c','d','eos','e']
+ >>> string_out = filter_seq2seq_output(string_pred, eos_id='eos')
+ >>> string_out
+ ['a', 'b', 'c', 'd']
+ """
+ if isinstance(string_pred, list):
+ try:
+ eos_index = next(
+ i for i, v in enumerate(string_pred) if v == eos_id
+ )
+ except StopIteration:
+ eos_index = len(string_pred)
+ string_out = string_pred[:eos_index]
+ else:
+ raise ValueError("The input must be a list.")
+ return string_out
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a899123463c3cd453fcc7a0677c81b9235e410d2
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/build.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+for i in fast*
+do
+ cd $i
+ bash build.sh
+ cd ..
+done
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/builder.py b/models/speech/speech_recognition/transformer_asr/ixrt/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c19a9f4bdd2133138621d0e39aebacf09e133f9
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/builder.py
@@ -0,0 +1,466 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import argparse
+import torch
+from tensorrt.deploy.api import GraphTransform, create_source, create_target
+from tensorrt.deploy.ir.data_type import DataType
+from tensorrt.deploy.ir.variable import Variable, VariableOptions
+from tensorrt.deploy.ir.graph import Graph
+from collections import OrderedDict
+import math
+import re
+import glob
+import os
+from onnx import numpy_helper
+import subprocess
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="build ixrt engine", usage=""
+ )
+ parser.add_argument(
+ "--ckpt_path",
+ type=str,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--head_num",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--max_batch_size",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--max_seq_len",
+ type=int,
+ required=True,
+ help="",
+ )
+ parser.add_argument(
+ "--onnx_path",
+ type=str,
+ default=".tmp.onnx",
+ help="",
+ )
+ parser.add_argument(
+ "--engine_path",
+ type=str,
+ required=True,
+ help="",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def add_make_mask_op(graph, state_dict, args):
+ attributes = {}
+
+ t = graph
+ inputs = [
+ graph.make_variable('length_radio', dtype=DataType.FLOAT16),
+ graph.make_variable('input', dtype=DataType.FLOAT16),
+ ]
+
+ outputs = [t.make_variable("attention_mask", dtype=DataType.INT32)]
+
+ t.make_operator(
+ "MakeMaskByRadio_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_custom_linear_op(graph, state_dict, args):
+ linear_keys = [
+ "1.custom_src_module.layers.0.w.weight",
+ "1.custom_src_module.layers.0.w.bias"
+ ]
+ W = numpy_helper.from_array(state_dict[linear_keys[0]].cpu().numpy(), name="W")
+ B = numpy_helper.from_array(state_dict[linear_keys[1]].cpu().numpy(), name="B")
+ attributes = {
+ "out_dims": state_dict["1.custom_src_module.layers.0.w.weight"].size(0),
+ "type_id": 1,
+ "W": W,
+ "B": B,
+ }
+ assert state_dict['1.custom_src_module.layers.0.w.weight'].size(
+ 0) == state_dict["1.custom_src_module.layers.0.w.bias"].size(0)
+
+ t = graph
+ inputs = [
+ graph.get_variable('input'),
+ ]
+
+ outputs = [t.make_variable("custom_src_output")]
+ t.make_operator(
+ "CustomFCPluginDynamic_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+# def add_custom_linear_op(graph, state_dict, args):
+# linear_keys = [
+# "1.custom_src_module.layers.0.w.weight",
+# "1.custom_src_module.layers.0.w.bias"
+# ]
+# attributes = {
+# "linear_dim": state_dict["1.custom_src_module.layers.0.w.weight"].size(0),
+# "hidden_size": state_dict["1.custom_src_module.layers.0.w.weight"].size(1),
+# "has_bias": 1,
+# "act_type": "none",
+# }
+# assert state_dict['1.custom_src_module.layers.0.w.weight'].size(
+# 0) == state_dict["1.custom_src_module.layers.0.w.bias"].size(0)
+#
+# t = graph
+# inputs = [
+# graph.get_variable('input'),
+# ]
+#
+# outputs = [t.make_variable("custom_src_output",dtype=DataType.FLOAT16)]
+# for key in linear_keys:
+# inputs.append(t.make_variable(name=key, value=state_dict[key].half()))
+# t.make_operator(
+# "LinearFP16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+
+def add_pos_encode_op(graph, state_dict, args):
+ attributes = {}
+ t = graph
+ inputs = [
+ graph.get_variable('custom_src_output'),
+ ]
+ outputs = [t.make_variable("hidden_state", dtype=DataType.FLOAT16)]
+ t.make_operator(
+ "PosEncodeSinCos_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_transformer_op(graph, state_dict, args):
+ enc_tensor_layer_fp16_keys = OrderedDict([
+ ["1.encoder.layers.{}.norm1.norm.weight", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm1.norm.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.in_proj_weight",
+ [args.hidden_size * 3, args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.in_proj_bias", [args.hidden_size * 3]],
+ ["1.encoder.layers.{}.self_att.att.out_proj.weight",
+ [args.hidden_size, args.hidden_size]],
+ ["1.encoder.layers.{}.self_att.att.out_proj.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.0.weight",
+ [args.inner_size, args.hidden_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.0.bias", [args.inner_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.3.weight",
+ [args.hidden_size, args.inner_size]],
+ ["1.encoder.layers.{}.pos_ffn.ffn.3.bias", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm2.norm.weight", [args.hidden_size]],
+ ["1.encoder.layers.{}.norm2.norm.bias", [args.hidden_size]],
+ ])
+ attributes_legcy = {
+ "hidden_size": args.hidden_size,
+ "num_layers": args.num_layers,
+ "head_num": args.head_num,
+ "head_dim": args.head_dim,
+ "inner_size": args.inner_size,
+ "act_type": "gelu",
+ "normalize_before": 1,
+ "is_fmha": 1,
+ "atten_scaler": 1 / math.sqrt(args.head_dim)
+ }
+
+
+ attributes = {
+ "hidden_size": int(args.hidden_size),
+ "num_layers": int(args.num_layers),
+ "head_num": int(args.head_num),
+ "head_dim": int(args.head_dim),
+ "inner_size": int(args.inner_size),
+ "act_type": 12, #gelu
+ "normalize_before": 1,
+ "is_fmha": 1,
+ "atten_scaler": 1.0 / math.sqrt(args.head_dim),
+ "max_seq_len": int(args.max_seq_len),
+ "max_batch_size": int(args.max_batch_size),
+
+ }
+
+ t = graph
+ inputs = [
+ graph.get_variable('hidden_state'),
+ graph.get_variable('attention_mask'),
+ ]
+ outputs = [t.make_variable("encoder_out", dtype=DataType.FLOAT16)]
+ for layer_id in range(args.num_layers):
+ for key, shape in enc_tensor_layer_fp16_keys.items():
+ # we need cat qkv gemm's weight and bias
+ new_key = key.format(layer_id)
+ w = state_dict[new_key]
+ if list(w.shape) != shape:
+ print("weights shape error!")
+ print("key: ", key)
+ print("need shape: ", shape)
+ print("weight shape: ", w.shape)
+ exit(1)
+ inputs.append(t.make_variable(name=new_key, value=w.half()))
+ t.make_operator(
+ "TransformerEncoderFp16_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_layer_norm_op(graph, state_dict, args):
+ enc_ln_tensor_fp16_keys = OrderedDict([
+ ["1.encoder.norm.norm.weight", [args.hidden_size]],
+ ["1.encoder.norm.norm.bias", [args.hidden_size]],
+ ])
+ attributes = {
+ "epsilon": 1e-5,
+ "axis": -1,
+ "stash_type": 1
+ }
+ t = graph
+ inputs = [
+ graph.get_variable('encoder_out'),
+ ]
+ outputs = [t.make_variable("encoder_ln_out")]
+ for key, shape in enc_ln_tensor_fp16_keys.items():
+ new_key = key
+ w = state_dict[new_key]
+ if list(w.shape) != shape:
+ print("weights shape error!")
+ print("key: ", key)
+ print("need shape: ", shape)
+ print("weight shape: ", w.shape)
+ exit(1)
+ inputs.append(t.make_variable(name=new_key, value=w.half()))
+ t.make_operator(
+ "LayerNormalization", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+# def add_layer_norm_op(graph, state_dict, args):
+# enc_ln_tensor_fp16_keys = OrderedDict([
+# ["1.encoder.norm.norm.weight", [args.hidden_size]],
+# ["1.encoder.norm.norm.bias", [args.hidden_size]],
+# ])
+# attributes = {
+# "hidden_size": args.hidden_size,
+# }
+# t = graph
+# inputs = [
+# graph.get_variable('encoder_out'),
+# ]
+# outputs = [t.make_variable("encoder_ln_out",dtype=DataType.FLOAT16)]
+# for key, shape in enc_ln_tensor_fp16_keys.items():
+# new_key = key
+# w = state_dict[new_key]
+# if list(w.shape) != shape:
+# print("weights shape error!")
+# print("key: ", key)
+# print("need shape: ", shape)
+# print("weight shape: ", w.shape)
+# exit(1)
+# inputs.append(t.make_variable(name=new_key, value=w.half()))
+# t.make_operator(
+# "LayerNormFp16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+def add_linear_op(graph, state_dict, args):
+ linear_keys = [
+ "3.w.weight",
+ "3.w.bias"
+ ]
+ W = numpy_helper.from_array(state_dict[linear_keys[0]].cpu().numpy(), name="W")
+ B = numpy_helper.from_array(state_dict[linear_keys[1]].cpu().numpy(), name="B")
+ attributes = {
+ "out_dims": state_dict["3.w.weight"].size(0),
+ "type_id": 1,
+ "W": W,
+ "B": B,
+ }
+ assert state_dict['3.w.weight'].size(0) == state_dict["3.w.bias"].size(0)
+
+ t = graph
+ inputs = [
+ graph.get_variable('encoder_ln_out'),
+ ]
+
+ outputs = [t.make_variable("lin_output")]
+ t.make_operator(
+ "CustomFCPluginDynamic_IxRT", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+#
+# def add_linear_op(graph, state_dict, args):
+# lin_keys = [
+# "3.w.weight",
+# "3.w.bias"
+# ]
+# attributes = {
+# "linear_dim": state_dict["3.w.weight"].size(0),
+# "hidden_size": state_dict["3.w.weight"].size(1),
+# "has_bias": 1,
+# "act_type": "none",
+# }
+# assert state_dict['3.w.weight'].size(0) == state_dict["3.w.bias"].size(0)
+#
+# t = graph
+# inputs = [
+# graph.get_variable('encoder_ln_out'),
+# ]
+#
+# outputs = [t.make_variable("lin_output",dtype=DataType.FLOAT16)]
+# for key in lin_keys:
+# inputs.append(t.make_variable(name=key, value=state_dict[key].half()))
+# t.make_operator(
+# "LinearFP16", inputs=inputs, outputs=outputs, **attributes
+# )
+
+
+def add_log_softmax_op(graph, state_dict, args):
+ attributes = {
+ "axis": "-1",
+ }
+
+ t = graph
+ inputs = [
+ graph.get_variable('lin_output'),
+ ]
+
+ outputs = [t.make_variable("log_softmax_output", dtype=DataType.FLOAT16)]
+
+ t.make_operator(
+ "LogSoftmax", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def add_search_node(graph, state_dict, args):
+ attributes = {
+ "vocab_size": args.vocab_size,
+ "eos_id": args.vocab_size,
+ "pad_id": -10000,
+ "beam_size": 1,
+ "attr1": 1.0,
+ "min_decode_ratio": 0.0,
+ "max_decode_ratio": 1.0,
+ "ctc_weight": 0.40,
+ "using_eos_threshold": 0,
+ "length_normalization": 1,
+ }
+ t = graph
+ inputs = [
+ graph.get_variable('lin_output'),
+ ]
+
+ outputs = [t.make_variable("output_tokens", dtype=DataType.INT32)]
+ list_value_half = []
+ list_key_half = []
+ for key in state_dict.keys():
+ if "decoder" in key or "custom_tgt_module" in key or "2.w.weight" in key or "2.w.bias" in key:
+ list_key_half.append(key)
+ list_value_half.append(state_dict[key].half())
+ for i, item in enumerate(list_key_half):
+ inputs.append(t.make_variable(name=list_key_half[i], value=list_value_half[i]))
+ t.make_operator(
+ "Search", inputs=inputs, outputs=outputs, **attributes
+ )
+
+
+def get_num_layers(state_dict):
+ num_layers = -1
+ for key in state_dict:
+ layer_id = re.search(
+ "1.encoder.layers.([0-9]+).pos_ffn.ffn.0.bias", key)
+ if layer_id:
+ layer_id = layer_id.group(1)
+ num_layers = max(num_layers, int(layer_id) + 1)
+ assert num_layers > 0
+ return num_layers
+
+
+def build_engine(onnx_file, engine_file, max_batch_size,max_seq_len):
+ cmd = f"ixrtexec --onnx {onnx_file} --min_shape input:1x32x5120,length_radio:1 --opt_shape input:8x64x5120,length_radio:8 --max_shape input:{max_batch_size}x{max_seq_len}x5120,length_radio:64 --plugins ixrt_plugin --save_engine {engine_file}"
+ subprocess.run(cmd.split(), check=True)
+
+
+def main(args):
+ graph = Graph()
+ transform = GraphTransform(graph)
+ ckpt_path = glob.glob(os.path.join(args.ckpt_path, "*/model.ckpt"))[0]
+ print("load ckpt from: ", ckpt_path)
+ state_dict = torch.load(ckpt_path)
+
+ # print([i for i in state_dict ])
+ # print(state_dict['3.w.bias'])
+ args.hidden_size = state_dict['1.encoder.layers.0.norm1.norm.weight'].size(
+ 0)
+ args.head_dim = args.hidden_size // args.head_num
+ args.inner_size = state_dict['1.encoder.layers.0.pos_ffn.ffn.0.bias'].size(
+ 0)
+ args.vocab_size = state_dict['3.w.weight'].size(0)
+
+ args.num_layers = get_num_layers(state_dict)
+
+ args.src_len = state_dict["1.custom_src_module.layers.0.w.weight"].size(1)
+
+ # args.num_layers = 1
+ add_make_mask_op(transform, state_dict, args)
+ add_custom_linear_op(transform, state_dict, args)
+ add_pos_encode_op(transform, state_dict, args)
+ add_transformer_op(transform, state_dict, args)
+ add_layer_norm_op(transform, state_dict, args)
+ # add_linear_op(transform, state_dict, args)
+ # add_log_softmax_op(transform, state_dict, args)
+ # add_search_node(transform, state_dict, args)
+
+ # IO attributes
+ length_radio = graph.get_variable('length_radio')
+ length_radio.set_shape(["batch_size"])
+ length_radio.dtype = "float16"
+ graph.add_input(length_radio)
+
+ input = graph.get_variable('input')
+ input.set_shape(["batch_size", "seq_len", "src_len"])
+ input.dtype = "float16"
+ graph.add_input(input)
+
+ output = graph.get_variable('encoder_ln_out')
+ output.dtype = "float16"
+ graph.add_output(output)
+
+ create_target(saved_path=args.onnx_path).export(graph)
+
+ build_engine(args.onnx_path, args.engine_path, args.max_batch_size, args.max_seq_len)
+ print("save engine: ", args.engine_path)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ ckpt_path = args.ckpt_path
+
+ main(args)
+
+"""
+python3 builder.py \
+--ckpt_path results/transformer/8886/save \
+--head_num 4 \
+--max_batch_size 64 \
+--max_seq_len 1024 \
+--engine_path transformer.engine
+"""
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/convert.py b/models/speech/speech_recognition/transformer_asr/ixrt/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..11d71a566c7d03daff6f063e46fb5984665714cb
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/convert.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+from faster_layer_norm import FasterLayerNorm
+
+def replace_layer_norm(model):
+ module_output = model
+
+ if isinstance(model, torch.nn.modules.normalization.LayerNorm):
+ return FasterLayerNorm(model.weight, model.bias)
+
+ for name, child in model.named_children():
+ module_output.add_module(
+ name, replace_layer_norm(child)
+ )
+ return module_output
+
+
+def convert_decoder_model(model):
+ model = replace_layer_norm(model)
+ # for layer in model.layers:
+ # norm = layer.norm1.norm
+ # print(type(norm))
+ # exit()
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm1.norm = new_norm
+
+ # norm = layer.norm2.norm
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm2.norm = new_norm
+
+ # norm = layer.norm3.norm
+ # new_norm = FasterLayerNorm(norm.weight, norm.bias)
+ # layer.norm3.norm = new_norm
+ return model
+
+# def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
+# if type(module) in layers:
+# return {name: module}
+# res = {}
+# for name1, child in module.named_children():
+# res.update(find_layers(
+# child, layers=layers, name=name + '.' + name1 if name != '' else name1
+# ))
+# return res
+
+def find_node(module):
+ if type(module) in [torch.nn.LayerNorm]:
+ print(module)
+ return
+ res = {}
+ for name1, child in module.named_children():
+ find_node(child)
+ return
+
+
+def patch_get_lookahead_mask(padded_input):
+ """Creates a binary mask for each sequence which maskes future frames.
+
+ Arguments
+ ---------
+ padded_input: torch.Tensor
+ Padded input tensor.
+
+ Example
+ -------
+ >>> a = torch.LongTensor([[1,1,0], [2,3,0], [4,5,0]])
+ >>> get_lookahead_mask(a)
+ tensor([[0., -inf, -inf],
+ [0., 0., -inf],
+ [0., 0., 0.]])
+ """
+ seq_len = padded_input.shape[1]
+ mask = (
+ torch.triu(torch.ones((seq_len, seq_len), device=padded_input.device))
+ == 1
+ ).transpose(0, 1)
+ mask = (
+ mask.float()
+ .masked_fill(mask == 0, float("-inf"))
+ .masked_fill(mask == 1, float(0.0))
+ )
+ return mask.detach().to(padded_input.device).to(torch.float16)
\ No newline at end of file
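
A minimal sketch (not part of the patch) of how the fp16 look-ahead mask returned by `patch_get_lookahead_mask` is typically broadcast-added to raw attention scores before the softmax. It assumes the `faster_layer_norm` extension is already built, since `convert.py` imports it at module load; all shapes are illustrative:

```python
import torch
from convert import patch_get_lookahead_mask  # this file

batch, seq_len, heads = 2, 3, 4
padded_input = torch.ones(batch, seq_len, 8, device="cuda", dtype=torch.float16)

mask = patch_get_lookahead_mask(padded_input)   # (seq_len, seq_len), fp16, 0 / -inf
scores = torch.randn(batch, heads, seq_len, seq_len, device="cuda", dtype=torch.float16)
probs = torch.softmax(scores + mask, dim=-1)    # future positions receive ~0 probability
```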
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py b/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db6ab7e1b92a58b579af85805ee9c0b98b8f3c0
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/ctc.py
@@ -0,0 +1,394 @@
+"""Decoders and output normalization for CTC.
+
+Authors
+ * Mirco Ravanelli 2020
+ * Aku Rouhe 2020
+ * Sung-Lin Yeh 2020
+"""
+import torch
+from itertools import groupby
+from speechbrain.dataio.dataio import length_to_mask
+from faster_logsumexp import FasterLogSumExp
+from faster_stack import FasterStack
+from faster_cat import FastCat
+
+
+class CTCPrefixScorer:
+ """This class implements the CTC prefix scorer of Algorithm 2 in
+ reference: https://www.merl.com/publications/docs/TR2017-190.pdf.
+ Official implementation: https://github.com/espnet/espnet/blob/master/espnet/nets/ctc_prefix_score.py
+
+ Arguments
+ ---------
+ x : torch.Tensor
+ The encoder states.
+ enc_lens : torch.Tensor
+ The actual length of each enc_states sequence.
+ batch_size : int
+ The size of the batch.
+ beam_size : int
+ The width of beam.
+ blank_index : int
+ The index of the blank token.
+ eos_index : int
+ The index of the end-of-sequence (eos) token.
+ ctc_window_size: int
+ Compute the ctc scores over the time frames using windowing based on attention peaks.
+ If 0, no windowing applied.
+ """
+
+ def __init__(
+ self,
+ x,
+ enc_lens,
+ batch_size,
+ beam_size,
+ blank_index,
+ eos_index,
+ ctc_window_size=0,
+ ):
+ self.blank_index = blank_index
+ self.eos_index = eos_index
+ self.max_enc_len = x.size(1)
+ self.batch_size = batch_size
+ self.beam_size = beam_size
+ self.vocab_size = x.size(-1)
+ self.device = x.device
+ self.minus_inf = -1e4
+ self.last_frame_index = enc_lens - 1
+ self.ctc_window_size = ctc_window_size
+
+ # mask frames > enc_lens
+ mask = 1 - length_to_mask(enc_lens)
+ mask = mask.unsqueeze(-1).expand(-1, -1, x.size(-1)).eq(1)
+ x.masked_fill_(mask, self.minus_inf)
+ x[:, :, 0] = x[:, :, 0].masked_fill_(mask[:, :, 0], 0)
+
+ # dim=0: xnb, nonblank posteriors, dim=1: xb, blank posteriors
+ xnb = x.transpose(0, 1)
+ xb = (
+ xnb[:, :, self.blank_index]
+ .unsqueeze(2)
+ .expand(-1, -1, self.vocab_size)
+ )
+
+ # (2, L, batch_size * beam_size, vocab_size)
+ # self.x = torch.stack([xnb, xb])
+ self.x = FasterStack([xnb.contiguous(), xb.contiguous()])
+
+ # The first index of each sentence.
+ self.beam_offset = (
+ torch.arange(batch_size, device=self.device) * self.beam_size
+ )
+ # The first index of each candidates.
+ self.cand_offset = (
+ torch.arange(batch_size, device=self.device) * self.vocab_size
+ )
+
+ def forward_step(self, g, state, candidates=None, attn=None):
+ """This method if one step of forwarding operation
+ for the prefix ctc scorer.
+
+ Arguments
+ ---------
+ g : torch.Tensor
+ The tensor of prefix label sequences, h = g + c.
+ state : tuple
+ Previous ctc states.
+ candidates : torch.Tensor
+ (batch_size * beam_size, ctc_beam_size), The topk candidates for rescoring.
+ The ctc_beam_size is set as 2 * beam_size. If given, performing partial ctc scoring.
+ """
+
+ prefix_length = g.size(1)
+ last_char = [gi[-1] for gi in g] if prefix_length > 0 else [0] * len(g)
+ self.num_candidates = (
+ self.vocab_size if candidates is None else candidates.size(-1)
+ )
+ if state is None:
+ # r_prev: (L, 2, batch_size * beam_size)
+ r_prev = torch.full(
+ (self.max_enc_len, 2, self.batch_size, self.beam_size),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+
+ # Accumulate blank posteriors at each step
+ r_prev[:, 1] = torch.cumsum(
+ self.x[0, :, :, self.blank_index], 0
+ ).unsqueeze(2)
+ r_prev = r_prev.view(-1, 2, self.batch_size * self.beam_size)
+ psi_prev = 0.0
+ else:
+ r_prev, psi_prev = state
+ r_prev = r_prev.half()
+
+ # for partial search
+ if candidates is not None:
+ scoring_table = torch.full(
+ (self.batch_size * self.beam_size, self.vocab_size),
+ -1,
+ dtype=torch.long,
+ device=self.device,
+ )
+ # Assign indices of candidates to their positions in the table
+ col_index = torch.arange(
+ self.batch_size * self.beam_size, device=self.device
+ ).unsqueeze(1)
+ scoring_table[col_index, candidates] = torch.arange(
+ self.num_candidates, device=self.device
+ )
+ # Select candidates indices for scoring
+ scoring_index = (
+ candidates
+ + self.cand_offset.unsqueeze(1)
+ .repeat(1, self.beam_size)
+ .view(-1, 1)
+ ).view(-1)
+ x_inflate = torch.index_select(
+ self.x.view(2, -1, self.batch_size * self.vocab_size),
+ 2,
+ scoring_index,
+ ).view(2, -1, self.batch_size * self.beam_size, self.num_candidates)
+ # for full search
+ else:
+ scoring_table = None
+ x_inflate = (
+ self.x.unsqueeze(3)
+ .repeat(1, 1, 1, self.beam_size, 1)
+ .view(
+ 2, -1, self.batch_size * self.beam_size, self.num_candidates
+ )
+ )
+
+ # Prepare forward probs
+ r = torch.full(
+ (
+ self.max_enc_len,
+ 2,
+ self.batch_size * self.beam_size,
+ self.num_candidates,
+ ),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+ r.fill_(self.minus_inf)
+
+ # (Alg.2-6)
+ if prefix_length == 0:
+ r[0, 0] = x_inflate[0, 0]
+ # (Alg.2-10): phi = prev_nonblank + prev_blank = r_t-1^nb(g) + r_t-1^b(g)
+ r_sum = FasterLogSumExp(r_prev, 1)
+ phi = r_sum.unsqueeze(2).repeat(1, 1, self.num_candidates)
+
+ # (Alg.2-10): if last token of prefix g in candidates, phi = prev_b + 0
+ if candidates is not None:
+ for i in range(self.batch_size * self.beam_size):
+ pos = scoring_table[i, last_char[i]]
+ if pos != -1:
+ phi[:, i, pos] = r_prev[:, 1, i]
+ else:
+ for i in range(self.batch_size * self.beam_size):
+ phi[:, i, last_char[i]] = r_prev[:, 1, i]
+
+ # Start, end frames for scoring (|g| < |h|).
+ # Scoring based on attn peak if ctc_window_size > 0
+ if self.ctc_window_size == 0 or attn is None:
+ start = max(1, prefix_length)
+ end = self.max_enc_len
+ else:
+ _, attn_peak = torch.max(attn, dim=1)
+ max_frame = torch.max(attn_peak).item() + self.ctc_window_size
+ min_frame = torch.min(attn_peak).item() - self.ctc_window_size
+ start = max(max(1, prefix_length), int(min_frame))
+ end = min(self.max_enc_len, int(max_frame))
+
+ # Compute forward prob log(r_t^nb(h)) and log(r_t^b(h)):
+ for t in range(start, end):
+ # (Alg.2-11): dim=0, p(h|cur step is nonblank) = [p(prev step=y) + phi] * p(c)
+ rnb_prev = r[t - 1, 0]
+ # (Alg.2-12): dim=1, p(h|cur step is blank) = [p(prev step is blank) + p(prev step is nonblank)] * p(blank)
+ rb_prev = r[t - 1, 1]
+ # r_ = torch.stack([rnb_prev, phi[t - 1], rnb_prev, rb_prev]).view(
+ # 2, 2, self.batch_size * self.beam_size, self.num_candidates
+ # )
+ r_ = FasterStack([rnb_prev, phi[t - 1], rnb_prev, rb_prev]).view(
+ 2, 2, self.batch_size * self.beam_size, self.num_candidates
+ )
+ r[t] = FasterLogSumExp(r_, 1) + x_inflate[:, t]
+
+ # Compute the prefix prob, psi
+ psi_init = r[start - 1, 0].unsqueeze(0)
+ # phi is prob at t-1 step, shift one frame and add it to the current prob p(c)
+ phix = FastCat((phi[0].unsqueeze(0), phi[:-1]), dim=0) + x_inflate[0]
+
+ # (Alg.2-13): psi = psi + phi * p(c)
+ if candidates is not None:
+ psi = torch.full(
+ (self.batch_size * self.beam_size, self.vocab_size),
+ self.minus_inf,
+ device=self.device,
+ dtype=torch.float16
+ )
+ psi_ = FasterLogSumExp(
+ FastCat((phix[start:end], psi_init), dim=0), dim=0
+ )
+ # only assign prob to candidates
+ for i in range(self.batch_size * self.beam_size):
+ psi[i, candidates[i]] = psi_[i]
+ else:
+ psi = FastCat((phix[start:end], psi_init), dim=0)
+ psi = FasterLogSumExp(psi, dim=0)
+
+ # (Alg.2-3): if c = <eos>, psi = log(r_T^n(g) + r_T^b(g)), where T is the length of max frames
+ for i in range(self.batch_size * self.beam_size):
+ psi[i, self.eos_index] = r_sum[
+ self.last_frame_index[i // self.beam_size], i
+ ]
+
+ # Exclude blank probs for joint scoring
+ psi[:, self.blank_index] = self.minus_inf
+
+ return psi - psi_prev, (r, psi, scoring_table)
+
+ def permute_mem(self, memory, index):
+ """This method permutes the CTC model memory
+ to synchronize the memory index with the current output.
+
+ Arguments
+ ---------
+ memory : No limit
+ The memory variable to be permuted.
+ index : torch.Tensor
+ The index of the previous path.
+
+ Return
+ ------
+ The variable of the memory being permuted.
+
+ """
+ r, psi, scoring_table = memory
+ # The index of top-K vocab came from in (t-1) timesteps.
+ best_index = (
+ index
+ + (self.beam_offset.unsqueeze(1).expand_as(index) * self.vocab_size)
+ ).view(-1)
+ # synchronize forward prob
+ psi = torch.index_select(psi.view(-1), dim=0, index=best_index)
+ psi = (
+ psi.view(-1, 1)
+ .repeat(1, self.vocab_size)
+ .view(self.batch_size * self.beam_size, self.vocab_size)
+ )
+
+ # synchronize ctc states
+ if scoring_table is not None:
+ effective_index = (
+ index // self.vocab_size + self.beam_offset.view(-1, 1)
+ ).view(-1)
+ selected_vocab = (index % self.vocab_size).view(-1)
+ score_index = scoring_table[effective_index, selected_vocab]
+ score_index[score_index == -1] = 0
+ best_index = score_index + effective_index * self.num_candidates
+
+ r = torch.index_select(
+ r.view(
+ -1, 2, self.batch_size * self.beam_size * self.num_candidates
+ ),
+ dim=-1,
+ index=best_index,
+ )
+ r = r.view(-1, 2, self.batch_size * self.beam_size)
+
+ return r, psi
+
+
+def filter_ctc_output(string_pred, blank_id=-1):
+ """Apply CTC output merge and filter rules.
+
+ Removes the blank symbol and output repetitions.
+
+ Arguments
+ ---------
+ string_pred : list
+ A list containing the output strings/ints predicted by the CTC system.
+ blank_id : int, string
+ The id of the blank.
+
+ Returns
+ -------
+ list
+ The output predicted by CTC without the blank symbol and
+ the repetitions.
+
+ Example
+ -------
+ >>> string_pred = ['a','a','blank','b','b','blank','c']
+ >>> string_out = filter_ctc_output(string_pred, blank_id='blank')
+ >>> print(string_out)
+ ['a', 'b', 'c']
+ """
+
+ if isinstance(string_pred, list):
+ # Filter the repetitions
+ string_out = [
+ v
+ for i, v in enumerate(string_pred)
+ if i == 0 or v != string_pred[i - 1]
+ ]
+
+ # Remove duplicates
+ string_out = [i[0] for i in groupby(string_out)]
+
+ # Filter the blank symbol
+ string_out = list(filter(lambda elem: elem != blank_id, string_out))
+ else:
+ raise ValueError("filter_ctc_out can only filter python lists")
+ return string_out
+
+
+def ctc_greedy_decode(probabilities, seq_lens, blank_id=-1):
+ """Greedy decode a batch of probabilities and apply CTC rules.
+
+ Arguments
+ ---------
+ probabilities : torch.tensor
+ Output probabilities (or log-probabilities) from the network with shape
+ [batch, probabilities, time]
+ seq_lens : torch.tensor
+ Relative true sequence lengths (to deal with padded inputs),
+ the longest sequence has length 1.0, others a value between zero and one
+ shape [batch, lengths].
+ blank_id : int, string
+ The blank symbol/index. Default: -1. If a negative number is given,
+ it is assumed to mean counting down from the maximum possible index,
+ so that -1 refers to the maximum possible index.
+
+ Returns
+ -------
+ list
+ Outputs as Python list of lists, with "ragged" dimensions; padding
+ has been removed.
+
+ Example
+ -------
+ >>> import torch
+ >>> probs = torch.tensor([[[0.3, 0.7], [0.0, 0.0]],
+ ... [[0.2, 0.8], [0.9, 0.1]]])
+ >>> lens = torch.tensor([0.51, 1.0])
+ >>> blank_id = 0
+ >>> ctc_greedy_decode(probs, lens, blank_id)
+ [[1], [1]]
+ """
+ if isinstance(blank_id, int) and blank_id < 0:
+ blank_id = probabilities.shape[-1] + blank_id
+ batch_max_len = probabilities.shape[1]
+ batch_outputs = []
+ for seq, seq_len in zip(probabilities, seq_lens):
+ actual_size = int(torch.round(seq_len * batch_max_len))
+ scores, predictions = torch.max(seq.narrow(0, 0, actual_size), dim=1)
+ out = filter_ctc_output(predictions.tolist(), blank_id=blank_id)
+ batch_outputs.append(out)
+ return batch_outputs
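
A rough, untested sketch (not part of the patch) of driving `CTCPrefixScorer` for one full-vocabulary scoring step. It assumes the `faster_*` extensions in this directory have been built and that SpeechBrain is installed; all dimensions are illustrative:

```python
import torch
from ctc import CTCPrefixScorer  # this file

batch_size, beam_size, T, vocab_size = 2, 3, 50, 30
blank_index, eos_index = 0, vocab_size - 1

# Per-utterance CTC log-probabilities from the encoder and the true encoder lengths.
ctc_logprobs = torch.randn(batch_size, T, vocab_size, device="cuda").half()
enc_lens = torch.full((batch_size,), T, device="cuda")

scorer = CTCPrefixScorer(ctc_logprobs, enc_lens, batch_size, beam_size,
                         blank_index, eos_index)

# Empty prefixes for every beam; the first call passes state=None.
g = torch.zeros(batch_size * beam_size, 0, dtype=torch.long, device="cuda")
psi, state = scorer.forward_step(g, None)
print(psi.shape)  # (batch_size * beam_size, vocab_size)
```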
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..537d35c57ca840042a6e694f2ab29c333246d625
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/__init__.py
@@ -0,0 +1,13 @@
+import torch
+from faster_cat import sp_opt
+
+def FastCat(inputs,dim=0):
+ if len(inputs) == 2 and dim==0:
+ a,b = inputs
+ in_shape = a.shape
+ if len(in_shape)>1:
+ res, = sp_opt.test_opt_2(a.view(a.shape[0],-1),b.view(b.shape[0],-1))
+ new_shape = (a.shape[0]+b.shape[0],) + in_shape[1:]
+ res = res.view(*new_shape)
+ return res
+ return torch.cat(inputs,dim=dim)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..022fac397611cccca7f35c6cad0406969b123bf2
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/kernel.cu
@@ -0,0 +1,79 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
+__global__ void Cat(half* a, half* b, half* output, int m1, int m2, int k) {
+ int i = blockIdx.y * blockDim.x + threadIdx.x;
+ // a
+ if (blockIdx.x < m1) {
+ half2* h2_a = reinterpret_cast<half2*>(a + blockIdx.x * k);
+ half2* h2_out_a = reinterpret_cast<half2*>(output + blockIdx.x * k);
+ if (i < k / 2) {
+ h2_out_a[i] = h2_a[i];
+ }
+ }
+ // b
+ if (blockIdx.x < m2) {
+ half2* h2_b = reinterpret_cast<half2*>(b + blockIdx.x * k);
+ half2* h2_out_b =
+ reinterpret_cast<half2*>(output + blockIdx.x * k + m1 * k);
+ if (i < k / 2) {
+ h2_out_b[i] = h2_b[i];
+ }
+ }
+}
+
+void IxinferCatLauncher(half* a, half* b, half* output, int m1, int m2, int k,
+ cudaStream_t stream) {
+ if (k % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int m = std::max(m1, m2);
+ int num_threads = 1024;
+ int half_k = k / 2;
+ int num_roll = (half_k - 1 + num_threads) / num_threads;
+ dim3 grid(m, num_roll);
+ dim3 block(num_threads);
+ Cat<<<grid, block, 0, stream>>>(a, b, output, m1, m2, k);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 2);
+ TORCH_CHECK(b.dim() == 2);
+
+ int m1 = a.size(0);
+ int m2 = b.size(0);
+
+ int k = a.size(1);
+
+ TORCH_CHECK(b.size(1) == k);
+
+ at::Tensor output = a.new_empty({(m1 + m2), k});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferCatLauncher(p_a, p_b, p_out, m1, m2, k,
+ stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1172081109fd7c970de661427851854f13313b21
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.cpp
@@ -0,0 +1,21 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b);
+
+std::vector<at::Tensor> test_opt_2(at::Tensor a, at::Tensor b) {
+ return one_test_opt_2(a, b);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt_2", &test_opt_2, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2713dae297bf44340caa556ba9f0dd3860219326
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_cat/test.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ m1 = 320
+ m2 = 321
+ hidden_size = 5000
+
+ a = torch.randn([m1,hidden_size]).cuda().half()
+ b = torch.randn([m2,hidden_size]).cuda().half()
+
+
+ res_pt = torch.cat([a,b],dim=0)
+
+ res_cu, = sp_opt.test_opt_2(a,b)
+
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_2(a,b)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..20603650006a6e4d586957a9c38193a3c12937a9
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/__init__.py
@@ -0,0 +1,16 @@
+import torch
+from faster_layer_norm import sp_opt
+
+class FasterLayerNorm(torch.nn.Module):
+ def __init__(self, weight, bias):
+ super(FasterLayerNorm, self).__init__()
+ self.weight = weight
+ self.bias = bias
+
+ def forward(self, inputs, *args, **kwargs):
+ hidden_size = self.weight.size(0)
+ in_shape = inputs.shape
+ inputs = inputs.view(-1,hidden_size)
+ output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+ output = output.view(*in_shape)
+ return output
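
A small parity check (not part of the patch) comparing `FasterLayerNorm` against `torch.nn.functional.layer_norm`. It assumes the extension has been built with `build.sh` so that `sp_opt` is importable from this directory; the sizes are arbitrary:

```python
import torch
from faster_layer_norm import FasterLayerNorm

hidden_size = 512
x = torch.randn(8, 100, hidden_size, device="cuda").half()
weight = torch.randn(hidden_size, device="cuda").half()
bias = torch.randn(hidden_size, device="cuda").half()

fast_ln = FasterLayerNorm(weight, bias)
out_fast = fast_ln(x)
out_ref = torch.nn.functional.layer_norm(x, (hidden_size,), weight, bias, eps=1e-5)

print((out_fast - out_ref).abs().max())  # expected to be a small fp16 rounding difference
```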
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..852db917b04c3c5c93d31e677e0d74aeb45f6edc
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/kernel.cu
@@ -0,0 +1,168 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+#include "transformer_helper.cuh"
+
+namespace iluvatar::inferrt::transformer {
+
+template <int UNROLL_FACTOR>
+__global__ void LnOpt2Kernel(half* input, half* ln_weight, half* ln_bias,
+ half* output, int hidden_size,
+ float layernorm_eps) {
+ input += blockIdx.x * hidden_size;
+ output += blockIdx.x * hidden_size;
+
+ half2* p_in = reinterpret_cast<half2*>(input);
+ half2* p_out = reinterpret_cast<half2*>(output);
+ half2* p_wei = reinterpret_cast<half2*>(ln_weight);
+ half2* p_bias = reinterpret_cast<half2*>(ln_bias);
+ int half_hidden_size = hidden_size / 2;
+
+ extern __shared__ half2 shmem[];
+
+ float s_mean;
+ float s_variance;
+ float x_sum = 0.0f;
+ float x2_sum = 0.0f;
+#pragma unroll UNROLL_FACTOR
+ for (int i = 0; i < UNROLL_FACTOR; ++i) {
+ int index = i * blockDim.x + threadIdx.x;
+ if (index < half_hidden_size) {
+ half2 value = p_in[index];
+ shmem[index] = value;
+ float val_1 = __half2float(value.x);
+ float val_2 = __half2float(value.y);
+ x_sum += val_1 + val_2;
+ x2_sum += val_1 * val_1 + val_2 * val_2;
+ }
+ }
+ float sums[2]; // sum, sum of squares
+ sums[0] = x_sum;
+ sums[1] = x2_sum;
+ blockReduceSumV2<float, 2>(sums);
+
+ s_mean = sums[0] / hidden_size;
+ s_variance = rsqrtf(sums[1] / hidden_size - s_mean * s_mean + layernorm_eps);
+
+#pragma unroll UNROLL_FACTOR
+ for (int i = 0; i < UNROLL_FACTOR; ++i) {
+ int index = i * blockDim.x + threadIdx.x;
+ if (index < half_hidden_size) {
+ half2 wei_value = p_wei[index];
+ half2 bias_value = p_bias[index];
+ half2 vals_value = shmem[index];
+
+ float2 norm_value;
+ norm_value.x = (__half2float(vals_value.x) - s_mean) * s_variance *
+ __half2float(wei_value.x) +
+ __half2float(bias_value.x);
+ norm_value.y = (__half2float(vals_value.y) - s_mean) * s_variance *
+ __half2float(wei_value.y) +
+ __half2float(bias_value.y);
+
+ __half2 res;
+ res.x = __float2half(norm_value.x);
+ res.y = __float2half(norm_value.y);
+
+ p_out[index] = res;
+ }
+ }
+}
+
+// FasterTransformer/src/fastertransformer/kernels/layernorm_kernels.cu
+void IxinferLnLauncherOpt2(__half* input, __half* ln_weight, __half* ln_bias,
+ __half* output, int batch_tokens, int hidden_size,
+ cudaStream_t stream) {
+ const float layernorm_eps = 1e-5;
+ if (hidden_size % 2 != 0) {
+ throw std::runtime_error("layer norm error: hidden_size % 2 != 0");
+ }
+ dim3 grid(batch_tokens);
+ int half_n = hidden_size / 2;
+ int half_n_warp = (half_n + warpSize - 1) / warpSize * warpSize;
+ dim3 block(std::min(half_n_warp, 1024));
+ int rolls_per_thread = (half_n + block.x - 1) / block.x;
+ // Dynamic shared memory for the extern __shared__ half2 buffer: hidden_size halves in total.
+ size_t shmem_bytes = hidden_size * sizeof(half);
+ switch (rolls_per_thread) {
+ case 1:
+ LnOpt2Kernel<1><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 2:
+ LnOpt2Kernel<2><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 3:
+ LnOpt2Kernel<3><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 4:
+ LnOpt2Kernel<4><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 5:
+ LnOpt2Kernel<5><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 6:
+ LnOpt2Kernel<6><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 7:
+ LnOpt2Kernel<7><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ case 8:
+ LnOpt2Kernel<8><<<grid, block, shmem_bytes, stream>>>(
+ input, ln_weight, ln_bias, output, hidden_size, layernorm_eps);
+ break;
+ default:
+ std::cout << "hidden_size: " << hidden_size << std::endl;
+ throw std::runtime_error("layer norm error, unsupported hidden size!");
+ break;
+ }
+}
+} // namespace iluvatar::inferrt::transformer
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(ln_weight.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(ln_weight.is_cuda());
+ TORCH_CHECK(ln_weight.is_contiguous());
+
+ TORCH_CHECK(ln_bias.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(ln_bias.is_cuda());
+ TORCH_CHECK(ln_bias.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 2);
+ TORCH_CHECK(ln_weight.dim() == 1);
+ TORCH_CHECK(ln_bias.dim() == 1);
+
+ int batch_tokens = input.size(0);
+ int hidden_size = input.size(1);
+
+ TORCH_CHECK(ln_weight.size(0) == hidden_size);
+ TORCH_CHECK(ln_bias.size(0) == hidden_size);
+
+ at::Tensor output = at::empty_like(input);
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_wei = (half*)ln_weight.data_ptr();
+ half* p_bias = (half*)ln_bias.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLnLauncherOpt2(
+ p_in, p_wei, p_bias, p_out, batch_tokens, hidden_size, stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f925c1b468189dbea8e5d8bfaaef623b989f3163
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/test.cpp
@@ -0,0 +1,22 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias);
+
+std::vector<at::Tensor> test_opt(at::Tensor input, at::Tensor ln_weight,
+ at::Tensor ln_bias) {
+ return one_test_opt(input, ln_weight, ln_bias);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "fast depthwise conv1d forward");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f8a57622c7d84e5549f99d419cda6bb5011a6ffa
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_layer_norm/transformer_helper.cuh
@@ -0,0 +1,295 @@
+#pragma once
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+namespace iluvatar {
+namespace inferrt {
+namespace transformer {
+
+__forceinline__ int nearest_4(int x) {
+ if (x % 4 == 0) {
+ return x;
+ } else {
+ int padding = 4 - x % 4;
+ return x + padding;
+ }
+}
+
+__forceinline__ int nearest_2(int x) {
+ if (x % 2 == 0) {
+ return x;
+ } else {
+ int padding = 2 - x % 2;
+ return x + padding;
+ }
+}
+
+__forceinline__ int nearest_num(int x, int value) {
+ if (x % value == 0) {
+ return x;
+ } else {
+ int padding = value - x % value;
+ return x + padding;
+ }
+}
+
+__device__ int8_t float2int8(float x, float quant_scale) {
+ float i8_f = x * quant_scale;
+ int32_t i8 = floorf(i8_f + 0.5);
+ i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8);
+ return int8_t(i8);
+}
+
+__device__ void WelfordCombine(float val, float *mean, float *m2,
+ float *count) {
+ // Use Welford's online algorithm to compute mean and variance
+ // For more details you can refer to:
+ // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+ *count += 1;
+ float delta1 = val - *mean;
+ *mean += delta1 / *count;
+ float delta2 = val - *mean;
+ *m2 += delta1 * delta2;
+}
+
+__device__ void WelfordCombine(float b_mean, float b_m2, float b_count,
+ float *mean, float *m2, float *count) {
+ if (b_count == 0) {
+ return;
+ }
+ float new_count = *count + b_count;
+ float nb_over_n = b_count / new_count;
+ float delta = b_mean - *mean;
+ *mean += delta * nb_over_n;
+ *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
+ *count = new_count;
+}
+
+__device__ void WelfordWarpReduce(float thread_mean, float thread_m2,
+ float thread_count, float *mean, float *m2,
+ float *count) {
+ *mean = thread_mean;
+ *m2 = thread_m2;
+ *count = thread_count;
+ for (int mask = warpSize / 2; mask > 0; mask /= 2) {
+ float b_mean = __shfl_down_sync(0xffffffff, *mean, mask);
+ float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask);
+ float b_count = __shfl_down_sync(0xffffffff, *count, mask);
+ WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
+ }
+}
+
+// load two half2 values and store them into a float4
+__device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) {
+ __half2 i1 = input[index * 2];
+ __half2 i2 = input[index * 2 + 1];
+
+ vals.x = __half2float(i1.x);
+ vals.y = __half2float(i1.y);
+ vals.z = __half2float(i2.x);
+ vals.w = __half2float(i2.y);
+}
+
+__device__ char4 float42char4(float4 vals, float quant_scale) {
+ char4 res;
+ res.x = float2int8(vals.x, quant_scale);
+ res.y = float2int8(vals.y, quant_scale);
+ res.z = float2int8(vals.z, quant_scale);
+ res.w = float2int8(vals.w, quant_scale);
+ return res;
+}
+
+__device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1,
+ half2 residual_2, float dequant_scale) {
+ float4 res;
+ res.x =
+ __int2float_rn(input_4.x) * dequant_scale + __half2float(residual_1.x);
+ res.y =
+ __int2float_rn(input_4.y) * dequant_scale + __half2float(residual_1.y);
+ res.z =
+ __int2float_rn(input_4.z) * dequant_scale + __half2float(residual_2.x);
+ res.w =
+ __int2float_rn(input_4.w) * dequant_scale + __half2float(residual_2.y);
+ return res;
+}
+
+__device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2,
+ int hidden_size, float epsilon,
+ half2 scale_1, half2 scale_2,
+ half2 bias_1, half2 bias_2) {
+ float4 norm_value;
+ norm_value.x = (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_1.x) +
+ __half2float(bias_1.x);
+ norm_value.y = (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_1.y) +
+ __half2float(bias_1.y);
+ norm_value.z = (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_2.x) +
+ __half2float(bias_2.x);
+ norm_value.w = (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) *
+ __half2float(scale_2.y) +
+ __half2float(bias_2.y);
+ return norm_value;
+}
+
+// softmax
+__forceinline__ __host__ __device__ int log2_ceil(int value) {
+ int log2_value = 0;
+ while ((1 << log2_value) < value) ++log2_value;
+ return log2_value;
+}
+template <typename T>
+__device__ T WARP_SHFL_XOR(T value, int laneMask, int width) {
+ unsigned int mask = 0xffffffff;
+#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__))
+ return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+ return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename T>
+struct Add {
+ __device__ T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename T>
+struct Max {
+ __device__ T operator()(T a, T b) const { return a < b ? b : a; }
+};
+template <typename acc_t, int REDUCE_WARP_SIZE, template <typename> class ReduceOp>
+__device__ void warp_reduce(acc_t *sum) {
+ ReduceOp<acc_t> r;
+#pragma unroll
+ for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) {
+ acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE);
+ *sum = r(*sum, b);
+ }
+}
+
+__device__ void warp_argmax(float &value, int32_t &idx) {
+ for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+ float next_value = WARP_SHFL_XOR(value, offset, warpSize);
+ int32_t next_idx = WARP_SHFL_XOR(idx, offset, warpSize);
+ if (next_value > value) {
+ value = next_value;
+ idx = next_idx;
+ }
+ }
+}
+
+// gelu
+// IxinferBiasGeluI8II8OKernel
+template <typename T>
+__device__ T tanhf_exp(T x) {
+ // float e1 = __expf(x);
+ // float e2 = 1.0f / e1;
+ // return (e1 - e2) / (e1 + e2);
+
+ return (2.f / (1.f + __expf(-2.f * x)) - 1.f);
+}
+
+template <typename T>
+__device__ T gelu(T x) {
+ float cdf =
+ 0.5f *
+ (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x))));
+ return x * cdf;
+}
+
+/* fp16 gelu */
+template <>
+__forceinline__ __device__ __half2 gelu<__half2>(__half2 val) {
+ __half2 val_pow3 = __hmul2(val, __hmul2(val, val));
+ float2 tmp_pow = __half22float2(val_pow3);
+ float2 tmp = __half22float2(val);
+
+ tmp.x =
+ 0.5f *
+ (1.0f + tanhf((0.7978845608028654f * (tmp.x + 0.044715f * tmp_pow.x))));
+ tmp.y =
+ 0.5f *
+ (1.0f + tanhf((0.7978845608028654f * (tmp.y + 0.044715f * tmp_pow.y))));
+ return __hmul2(val, __float22half2_rn(tmp));
+}
+
+/* Convert vector index to 3-dim tensor index */
+__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1,
+ int dim2, int *id0,
+ int *id1, int *id2) {
+ *id2 = src % dim2;
+ src /= dim2;
+
+ *id1 = src % dim1;
+ *id0 = src / dim1;
+}
+
+template <typename T, int NUM>
+__inline__ __device__ T warpReduceSumV2(T *val) {
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+#pragma unroll
+ for (int mask = warpSize / 2; mask > 0; mask >>= 1)
+ val[i] += __shfl_xor_sync(0xffffffff, val[i], mask, warpSize);
+ }
+ return (T)(0.0f);
+}
+
+template <typename T, int NUM>
+__inline__ __device__ T blockReduceSumV2(T *val) {
+ static __shared__ T shared[NUM][warpSize + 1];
+ int lane = threadIdx.x % warpSize;
+ int wid = threadIdx.x / warpSize;
+
+ warpReduceSumV2<T, NUM>(val);
+
+ if (lane == 0) {
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+ shared[i][wid] = val[i];
+ }
+ }
+
+ __syncthreads();
+
+ bool is_mask = lane < (blockDim.x / warpSize);
+#pragma unroll
+ for (int i = 0; i < NUM; i++) {
+ val[i] = is_mask ? shared[i][lane] : (T)(0.0f);
+ }
+ warpReduceSumV2<T, NUM>(val);
+ return (T)0.0f;
+}
+
+__inline__ __device__ void warpReduceSum2Number(float *x, float *y) {
+#pragma unroll
+ for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
+ *x += __shfl_xor_sync(0xffffffff, *x, mask, warpSize);
+ *y += __shfl_xor_sync(0xffffffff, *y, mask, warpSize);
+ }
+}
+
+__inline__ __device__ void blockReduceSum2Number(float *x, float *y) {
+ static __shared__ float shared[2][warpSize + 1];
+ int lane = threadIdx.x % warpSize;
+ int wid = threadIdx.x / warpSize;
+
+ warpReduceSum2Number(x, y);
+ if (lane == 0) {
+ shared[0][wid] = *x;
+ shared[1][wid] = *y;
+ }
+ __syncthreads();
+ bool is_mask = lane < (blockDim.x / warpSize);
+ *x = is_mask ? shared[0][lane] : 0.0f;
+ *y = is_mask ? shared[1][lane] : 0.0f;
+
+ warpReduceSum2Number(x, y);
+}
+
+} // namespace transformer
+
+} // namespace inferrt
+} // namespace iluvatar
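
The `WelfordCombine` helpers above implement Welford's online mean/variance update. A short Python illustration (not part of the patch) of the same single-value update rule, checked against a direct computation:

```python
import numpy as np

def welford_update(mean, m2, count, val):
    # Mirrors WelfordCombine(val, &mean, &m2, &count) above.
    count += 1
    delta1 = val - mean
    mean += delta1 / count
    delta2 = val - mean
    m2 += delta1 * delta2
    return mean, m2, count

xs = np.random.randn(1000)
mean, m2, count = 0.0, 0.0, 0.0
for v in xs:
    mean, m2, count = welford_update(mean, m2, count, v)

assert np.isclose(mean, xs.mean())
assert np.isclose(m2 / count, xs.var())  # population variance
```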
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d50b3758c439e1c8c73f3d1ad07d104526be71ab
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/__init__.py
@@ -0,0 +1,38 @@
+import torch
+from faster_logsumexp import sp_opt
+
+# class FasterLogSumExp(torch.nn.Module):
+# def __init__(self, weight, bias):
+# super(FasterLogSumExp, self).__init__()
+# self.weight = weight
+# self.bias = bias
+
+# def forward(self, inputs, *args, **kwargs):
+# hidden_size = self.weight.size(0)
+# in_shape = inputs.shape
+# inputs = inputs.view(-1,hidden_size)
+# output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+# output = output.view(*in_shape)
+# return output
+
+def FasterLogSumExp(inputs,dim):
+ # print(inputs.shape, dim)
+ if dim == 1 and len(inputs.shape)>2 and inputs.size(1)==2:
+ in_shape = inputs.shape
+ inputs = inputs.view(in_shape[0],in_shape[1],-1)
+ res, = sp_opt.test_opt(inputs)
+ new_shape = (in_shape[0],) + in_shape[2:]
+ res = res.view(*new_shape)
+ return res
+ # dim==0: the current kernel implementation may still have a bug?
+ # if dim == 0 and len(inputs.shape)>=2:
+ # in_shape = inputs.shape
+ # inputs = inputs.view(in_shape[0],-1)
+ # res, = sp_opt.test_opt_dim0(inputs)
+ # new_shape = in_shape[1:]
+ # res = res.view(*new_shape)
+ # return res
+ # print(f"not support shape: {inputs.shape} dim: {dim}")
+ res = torch.logsumexp(inputs, dim=dim)
+ return res
+
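
`FasterLogSumExp` dispatches to the custom kernel only for a 3-D fp16 tensor reduced over `dim=1` when that dimension has size 2, and otherwise falls back to `torch.logsumexp`. A quick sketch (not part of the patch), assuming the extension has been built with `build.sh`:

```python
import torch
from faster_logsumexp import FasterLogSumExp

x = torch.randn(16, 2, 4096, device="cuda").half()

out_fast = FasterLogSumExp(x, 1)                 # custom kernel path
out_ref = torch.logsumexp(x.float(), dim=1).half()
print(out_fast.shape)                            # torch.Size([16, 4096])
print((out_fast - out_ref).abs().max())          # small fp16 difference expected

y = torch.randn(4, 3, 5, device="cuda").half()
print(FasterLogSumExp(y, 0).shape)               # falls back to torch.logsumexp -> (3, 5)
```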
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..56eb0810bcb46f121761cd43cba931d285b0635c
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/kernel.cu
@@ -0,0 +1,155 @@
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <algorithm>
+#include <cmath>
+#include <stdexcept>
+#include <vector>
+#include <torch/extension.h>
+
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
+__global__ void LogSumExpWith2(half* input, half* output, int H) {
+ half2* h2_in1 = reinterpret_cast<half2*>(input + blockIdx.x * 2 * H);
+ half2* h2_in2 = reinterpret_cast<half2*>(input + blockIdx.x * 2 * H + H);
+ half2* h2_out = reinterpret_cast<half2*>(output + blockIdx.x * H);
+
+ int i = blockIdx.y * blockDim.x + threadIdx.x;
+ if (i < H / 2) {
+ float2 res;
+ half2 value1 = h2_in1[i];
+ half2 value2 = h2_in2[i];
+
+ res.x = std::log(__expf(__half2float(value1.x)) +
+ __expf(__half2float(value2.x)));
+ res.y = std::log(__expf(__half2float(value1.y)) +
+ __expf(__half2float(value2.y)));
+
+ half2 res_h2;
+ res_h2.x = __float2half(res.x);
+ res_h2.y = __float2half(res.y);
+ h2_out[i] = res_h2;
+ }
+}
+
+void IxinferLogSumExpLauncher(half* input, half* output, int N, int C, int H,
+ cudaStream_t stream) {
+ const float layernorm_eps = 1e-5;
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferLogSumExpLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(N, num_roll);
+ dim3 block(num_threads);
+ switch (C) {
+ case 2:
+      LogSumExpWith2<<<grid, block, 0, stream>>>(input, output, H);
+ break;
+ default:
+ throw std::runtime_error(
+          "IxinferLogSumExpLauncher error, unsupported size!");
+ break;
+ }
+}
+
+// https://zhuanlan.zhihu.com/p/153535799
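+// Reduces an [N, H] fp16 tensor over dim 0 with the usual stabilization
+//   logsumexp(x) = max(x) + log(sum(exp(x - max(x))))
+// so exp() cannot overflow; each thread produces two adjacent outputs via half2.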
+__global__ void LogSumExpDim0(half* input, half* output, int N, int H) {
+  half2* h2_out = reinterpret_cast<half2*>(output);
+
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  // Guard the tail block: H / 2 may not be a multiple of blockDim.x.
+  if (i >= H / 2) return;
+
+ float2 res;
+ res.x = 0.f;
+ res.y = 0.f;
+
+ float2 max_values;
+ max_values.x = -1000.f;
+ max_values.y = -1000.f;
+
+ for (int batch_idx = 0; batch_idx < N; batch_idx++) {
+    half2* h2_in = reinterpret_cast<half2*>(input + batch_idx * H);
+ half2 value = h2_in[i];
+
+ if (max_values.x < __half2float(value.x)) {
+ max_values.x = __half2float(value.x);
+ }
+ if (max_values.y < __half2float(value.y)) {
+ max_values.y = __half2float(value.y);
+ }
+ }
+
+ for (int batch_idx = 0; batch_idx < N; batch_idx++) {
+    half2* h2_in = reinterpret_cast<half2*>(input + batch_idx * H);
+ half2 value = h2_in[i];
+
+ res.x += __expf(__half2float(value.x) - max_values.x);
+ res.y += __expf(__half2float(value.y) - max_values.y);
+ }
+
+ half2 res_h2;
+ res_h2.x = __float2half(std::log(res.x) + max_values.x);
+ res_h2.y = __float2half(std::log(res.y) + max_values.y);
+
+ h2_out[i] = res_h2;
+}
+
+void IxinferLogSumExpLauncher(half* input, half* output, int N, int H,
+ cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferLogSumExpLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  LogSumExpDim0<<<grid, block, 0, stream>>>(input, output, N, H);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
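+// Torch-facing entry point: expects a contiguous fp16 CUDA tensor of shape
+// [N, C, H], allocates an [N, H] output, and reduces over dim 1 on the current
+// CUDA stream.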
+std::vector<at::Tensor> one_test_opt(at::Tensor input) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 3);
+
+ int N = input.size(0);
+ int C = input.size(1);
+ int H = input.size(2);
+
+ at::Tensor output = input.new_empty({N, H});
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLogSumExpLauncher(p_in, p_out, N, C, H,
+ stream);
+ return {output};
+}
+
+std::vector<at::Tensor> one_test_dim0(at::Tensor input) {
+ TORCH_CHECK(input.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(input.is_cuda());
+ TORCH_CHECK(input.is_contiguous());
+
+ TORCH_CHECK(input.dim() == 2);
+
+ int N = input.size(0);
+ int H = input.size(1);
+
+ at::Tensor output = input.new_empty({H});
+
+ half* p_in = (half*)input.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferLogSumExpLauncher(p_in, p_out, N, H,
+ stream);
+ return {output};
+}
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5eaf6fe16e38d1a5694c391de30a6c4b82ed2af5
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.cpp
@@ -0,0 +1,27 @@
+#include <vector>
+
+#include <torch/extension.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor input);
+
+std::vector<at::Tensor> test_opt(at::Tensor input) {
+  return one_test_opt(input);
+}
+
+std::vector<at::Tensor> one_test_dim0(at::Tensor input);
+
+std::vector<at::Tensor> test_opt_dim0(at::Tensor input) {
+ return one_test_dim0(input);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "");
+ m.def("test_opt_dim0", &test_opt_dim0, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b22dbddab13f94854c0e334ca53348d9c41f2ba
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_logsumexp/test.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ batch_tokens = 2
+ c = 2
+ hidden_size = 320*5000
+
+ inputs = torch.randn([batch_tokens,c, hidden_size]).cuda().half()
+
+ # res1 = torch.log(torch.sum(torch.exp(inputs),dim=-1))
+ # res2 = torch.logsumexp(inputs,dim=-1)
+ # diff = torch.abs(res1-res2)
+ # print(diff.max())
+
+ res_pt = torch.logsumexp(inputs,dim=1)
+
+ res_cu, = sp_opt.test_opt(inputs)
+
+ diff = torch.abs(res_pt - res_cu)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt(inputs)
+
+ batch_tokens = 55
+ hidden_size = 320*5000
+ inputs = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ res_pt = torch.logsumexp(inputs,dim=0)
+ res_cu, = sp_opt.test_opt_dim0(inputs)
+
+ diff = torch.abs(res_pt - res_cu)
+ print(diff.max())
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_dim0(inputs)
+
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48d0cf5b0f3bdd03e18f19dbfded0704b7048b2f
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/__init__.py
@@ -0,0 +1,33 @@
+import torch
+from faster_stack import sp_opt
+
+# class FasterLogSumExp(torch.nn.Module):
+# def __init__(self, weight, bias):
+# super(FasterLogSumExp, self).__init__()
+# self.weight = weight
+# self.bias = bias
+
+# def forward(self, inputs, *args, **kwargs):
+# hidden_size = self.weight.size(0)
+# in_shape = inputs.shape
+# inputs = inputs.view(-1,hidden_size)
+# output, = sp_opt.test_opt(inputs,self.weight,self.bias)
+# output = output.view(*in_shape)
+# return output
+
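+# Minimal usage sketch (illustrative only; the tensors below are made up). The fused
+# kernel only covers lists of exactly 4 or 2 same-shaped, contiguous fp16 CUDA
+# tensors; every other case falls back to torch.stack:
+#   a = torch.randn(320, 5000, device="cuda", dtype=torch.float16)
+#   b = torch.randn(320, 5000, device="cuda", dtype=torch.float16)
+#   y = FasterStack([a, b])              # custom kernel path, shape [2, 320, 5000]
+#   y_ref = torch.stack([a, b])          # PyTorch reference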
+def FasterStack(inputs):
+ if len(inputs) == 4:
+ a,b,c,d = inputs
+ in_shape = a.shape
+ res, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+ new_shape = (4,) + in_shape
+ res = res.view(*new_shape)
+ return res
+ if len(inputs) == 2:
+ a,b = inputs
+ in_shape = a.shape
+ res, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+ new_shape = (2,) + in_shape
+ res = res.view(*new_shape)
+ return res
+ return torch.stack(inputs)
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f679258d4f3e5e66187918d5b2126613a736dd6b
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/build.sh
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -euox pipefail
+
+rm -rf build
+rm -rf *.so
+
+python3 setup.py build
+
+cp build/lib*/*.so .
\ No newline at end of file
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0fdff64992ad17bb85b43632a0be28f9ae2419be
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/kernel.cu
@@ -0,0 +1,146 @@
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+
+namespace iluvatar::inferrt::transformer {
+
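+// Copies four equally sized fp16 buffers back-to-back into a single output of
+// length 4 * H, i.e. the flattened equivalent of torch.stack on four 1-D tensors.
+// Each thread moves one half2 (two fp16 values) per input, so H must be even.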
+__global__ void Stack(half* a, half* b, half* c, half* d, half* output, int H) {
+  half2* h2_a = reinterpret_cast<half2*>(a);
+  half2* h2_b = reinterpret_cast<half2*>(b);
+  half2* h2_c = reinterpret_cast<half2*>(c);
+  half2* h2_d = reinterpret_cast<half2*>(d);
+
+  half2* h2_out_a = reinterpret_cast<half2*>(output);
+  half2* h2_out_b = reinterpret_cast<half2*>(output + H);
+  half2* h2_out_c = reinterpret_cast<half2*>(output + H * 2);
+  half2* h2_out_d = reinterpret_cast<half2*>(output + H * 3);
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (i < H / 2) {
+ h2_out_a[i] = h2_a[i];
+ h2_out_b[i] = h2_b[i];
+ h2_out_c[i] = h2_c[i];
+ h2_out_d[i] = h2_d[i];
+ }
+}
+
+void IxinferStackLauncher(half* a, half* b, half* c, half* d, half* output,
+ int H, cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  Stack<<<grid, block, 0, stream>>>(a, b, c, d, output, H);
+}
+
+__global__ void Stack(half* a, half* b, half* output, int H) {
+  half2* h2_a = reinterpret_cast<half2*>(a);
+  half2* h2_b = reinterpret_cast<half2*>(b);
+
+  half2* h2_out_a = reinterpret_cast<half2*>(output);
+  half2* h2_out_b = reinterpret_cast<half2*>(output + H);
+
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+ if (i < H / 2) {
+ h2_out_a[i] = h2_a[i];
+ h2_out_b[i] = h2_b[i];
+ }
+}
+
+void IxinferStackLauncher(half* a, half* b, half* output, int H,
+ cudaStream_t stream) {
+ if (H % 2 != 0) {
+ throw std::runtime_error("IxinferStackLauncher: size error!");
+ }
+ int num_threads = 1024;
+ int half_h = H / 2;
+ int num_roll = (half_h - 1 + num_threads) / num_threads;
+ dim3 grid(num_roll);
+ dim3 block(num_threads);
+  Stack<<<grid, block, 0, stream>>>(a, b, output, H);
+}
+
+} // namespace iluvatar::inferrt::transformer
+
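+// Torch-facing entry points: validate dtype/device/contiguity and matching
+// lengths, allocate the stacked output, and launch the copy kernel on the
+// current CUDA stream.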
+std::vector<at::Tensor> one_test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                     at::Tensor d) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(c.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(c.is_cuda());
+ TORCH_CHECK(c.is_contiguous());
+
+ TORCH_CHECK(d.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(d.is_cuda());
+ TORCH_CHECK(d.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 1);
+ TORCH_CHECK(b.dim() == 1);
+ TORCH_CHECK(c.dim() == 1);
+ TORCH_CHECK(d.dim() == 1);
+
+ int N = a.size(0);
+
+ TORCH_CHECK(b.size(0) == N);
+ TORCH_CHECK(c.size(0) == N);
+ TORCH_CHECK(d.size(0) == N);
+
+ at::Tensor output = a.new_empty({N * 4});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_c = (half*)c.data_ptr();
+ half* p_d = (half*)d.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferStackLauncher(p_a, p_b, p_c, p_d,
+ p_out, N, stream);
+ return {output};
+}
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b) {
+ TORCH_CHECK(a.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(a.is_cuda());
+ TORCH_CHECK(a.is_contiguous());
+
+ TORCH_CHECK(b.scalar_type() == at::ScalarType::Half);
+ TORCH_CHECK(b.is_cuda());
+ TORCH_CHECK(b.is_contiguous());
+
+ TORCH_CHECK(a.dim() == 1);
+ TORCH_CHECK(b.dim() == 1);
+
+ int N = a.size(0);
+
+ TORCH_CHECK(b.size(0) == N);
+
+ at::Tensor output = a.new_empty({N * 2});
+
+ half* p_a = (half*)a.data_ptr();
+ half* p_b = (half*)b.data_ptr();
+ half* p_out = (half*)output.data_ptr();
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+ iluvatar::inferrt::transformer::IxinferStackLauncher(p_a, p_b, p_out, N,
+ stream);
+ return {output};
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..a031577c2d36f4451bab736af4306b4bd741adc4
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/setup.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import os
+import glob
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+CUR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# cpp_files = glob.glob(os.path.join(CUR_DIR,"*.cpp"))
+# cu_files = glob.glob(os.path.join(CUR_DIR,'*.cu'))
+# source_files = cpp_files + cu_files
+# print("source files:")
+# for i in source_files:
+# print(i)
+source_files = [
+ os.path.join(CUR_DIR,'test.cpp'),
+ os.path.join(CUR_DIR,'kernel.cu'),
+]
+
+for i in source_files:
+ assert os.path.isfile(i)
+ print(i)
+
+setup(
+ name="test",
+ ext_modules=[
+ CUDAExtension(
+ name="sp_opt",
+ libraries=["cuinfer"],
+ sources=source_files)
+ ],
+ cmdclass={
+ "build_ext": BuildExtension
+ }
+)
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..087030645cd95a86222ddfd0db55958edc3a49c6
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.cpp
@@ -0,0 +1,29 @@
+#include <vector>
+
+#include <torch/extension.h>
+
+std::vector<at::Tensor> one_test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                     at::Tensor d);
+
+std::vector<at::Tensor> test_opt(at::Tensor a, at::Tensor b, at::Tensor c,
+                                 at::Tensor d) {
+  return one_test_opt(a, b, c, d);
+}
+
+std::vector<at::Tensor> one_test_opt_2(at::Tensor a, at::Tensor b);
+
+std::vector<at::Tensor> test_opt_2(at::Tensor a, at::Tensor b) {
+ return one_test_opt_2(a, b);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("test_opt", &test_opt, "");
+ m.def("test_opt_2", &test_opt_2, "");
+}
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..185b829b9cb9372cf846f5828e668eaa984ab442
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/faster_stack/test.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import torch
+import sp_opt
+
+if __name__ == "__main__":
+ batch_tokens = 320
+ hidden_size = 5000
+
+ a = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ b = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ c = torch.randn([batch_tokens,hidden_size]).cuda().half()
+ d = torch.randn([batch_tokens,hidden_size]).cuda().half()
+
+ res_pt = torch.stack([a,b,c,d])
+
+ res_cu, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+ res_cu = res_cu.view(4,batch_tokens,hidden_size)
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+
+ for i in range(20):
+ res_cu, = sp_opt.test_opt(a.view(-1),b.view(-1),c.view(-1),d.view(-1))
+
+ res_pt = torch.stack([a,b])
+
+ res_cu, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+ res_cu = res_cu.view(2,batch_tokens,hidden_size)
+
+ diff = torch.abs(res_pt-res_cu)
+ print(diff)
+ print(diff.max())
+ for i in range(20):
+ res_cu, = sp_opt.test_opt_2(a.view(-1),b.view(-1))
+
diff --git a/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml b/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..859d09f31020a99c41462a367d47b8e986576841
--- /dev/null
+++ b/models/speech/speech_recognition/transformer_asr/ixrt/hparams/train_ASR_transformer.yaml
@@ -0,0 +1,253 @@
+# ############################################################################
+# Model: E2E ASR with Transformer
+# Encoder: Transformer Encoder
+# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch
+# Tokens: BPE with unigram
+# losses: CTC + KLdiv (Label Smoothing loss)
+# Training: AISHELL-1
+# Authors: Jianyuan Zhong, Titouan Parcollet
+# ############################################################################
+# Seed needs to be set at top of yaml, before objects with parameters are made
+seed: 8886
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+output_folder: !ref results/transformer/<seed>
+cer_file: !ref <output_folder>/cer.txt
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# Data files
+data_folder: !PLACEHOLDER # e.g., /path/to/aishell
+# noise/ris dataset will automatically be downloaded
+data_folder_rirs: !ref <data_folder> # Change this if needed
+skip_prep: False
+ckpt_interval_minutes: 15 # save checkpoint every N min
+train_data: !ref <output_folder>/csv_data/train.csv
+valid_data: !ref <output_folder>/csv_data/dev.csv
+test_data: !ref <output_folder>/csv_data/test.csv
+tokenizer_file: speechbrain/asr-transformer-aishell/tokenizer.ckpt
+
+# Training parameters
+number_of_epochs: 50
+batch_size: 64
+ctc_weight: 0.3
+gradient_accumulation: 4
+loss_reduction: 'batchmean'
+sorting: ascending
+
+dynamic_batching: False
+dynamic_batch_sampler:
+ feats_hop_size: 0.01
+  max_batch_len: 15 # in terms of "duration" in annotations by default, seconds here
+  left_bucket_len: 200 # old implementation attributes
+  multiplier: 1.1 # old implementation attributes
+ shuffle_ex: False # if true re-creates batches at each epoch shuffling examples.
+ num_buckets: 10 # floor(log(max_batch_len/left_bucket_len, multiplier)) + 1
+ batch_ordering: ascending
+
+num_workers: 6
+
+# stages related parameters
+stage_one_epochs: 40
+lr_adam: 1.0
+lr_sgd: 0.000025
+
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 80
+
+# Dataloader options
+train_dataloader_opts:
+  batch_size: !ref <batch_size>
+  shuffle: True
+
+valid_dataloader_opts:
+  batch_size: !ref <batch_size>
+
+test_dataloader_opts:
+  batch_size: !ref <batch_size>
+
+####################### Model parameters ###########################
+# Transformer
+d_model: 256
+nhead: 4
+num_encoder_layers: 12
+num_decoder_layers: 6
+d_ffn: 2048
+transformer_dropout: 0.1
+activation: !name:torch.nn.GELU
+output_neurons: 5000
+
+# Outputs
+blank_index: 0
+label_smoothing: 0.1
+pad_index: 0
+bos_index: 1
+eos_index: 2
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0 # 1.0
+valid_search_interval: 10
+valid_beam_size: 10
+test_beam_size: 1
+ctc_weight_decode: 0.40
+
+############################## models ################################
+
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+ input_shape: (8, 10, 80)
+ num_blocks: 2
+ num_layers_per_block: 1
+ out_channels: (256, 256)
+ kernel_sizes: (3, 3)
+ strides: (2, 2)
+ residuals: (False, False)
+
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
+ input_size: 5120
+  tgt_vocab: !ref <output_neurons>
+  d_model: !ref <d_model>
+  nhead: !ref <nhead>
+  num_encoder_layers: !ref <num_encoder_layers>
+  num_decoder_layers: !ref <num_decoder_layers>
+  d_ffn: !ref <d_ffn>
+  dropout: !ref <transformer_dropout>
+  activation: !ref <activation>
+ normalize_before: True
+
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+  input_size: !ref <d_model>
+  n_neurons: !ref <output_neurons>
+
+seq_lin: !new:speechbrain.nnet.linear.Linear
+  input_size: !ref <d_model>
+  n_neurons: !ref <output_neurons>
+
+env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
+  openrir_folder: !ref <data_folder_rirs>
+ babble_prob: 0.0
+ reverb_prob: 0.0
+ noise_prob: 1.0
+ noise_snr_low: 0
+ noise_snr_high: 15
+
+modules:
+  CNN: !ref <CNN>
+  Transformer: !ref <Transformer>
+  seq_lin: !ref <seq_lin>
+  ctc_lin: !ref <ctc_lin>
+  env_corrupt: !ref <env_corrupt>
+
+model: !new:torch.nn.ModuleList
+  - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+# define two optimizers here for two-stage training
+Adam: !name:torch.optim.Adam
+ lr: 0
+ betas: (0.9, 0.98)
+ eps: 0.000000001
+
+SGD: !name:torch.optim.SGD
+  lr: !ref <lr_sgd>
+ momentum: 0.99
+ nesterov: True
+
+
+valid_search: !new:speechbrain.decoders.S2STransformerBeamSearch
+  modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+  bos_index: !ref <bos_index>
+  eos_index: !ref <eos_index>
+  blank_index: !ref <blank_index>
+  min_decode_ratio: !ref <min_decode_ratio>
+  max_decode_ratio: !ref <max_decode_ratio>
+  beam_size: !ref <valid_beam_size>
+  ctc_weight: !ref <ctc_weight_decode>
+ using_eos_threshold: False
+ length_normalization: True
+
+test_search: !new:speechbrain.decoders.S2STransformerBeamSearch
+  modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+  bos_index: !ref <bos_index>
+  eos_index: !ref <eos_index>
+  blank_index: !ref <blank_index>
+  min_decode_ratio: !ref <min_decode_ratio>
+  max_decode_ratio: !ref <max_decode_ratio>
+  beam_size: !ref <test_beam_size>
+  ctc_weight: !ref <ctc_weight_decode>
+ using_eos_threshold: False
+ length_normalization: True
+
+log_softmax: !new:torch.nn.LogSoftmax
+ dim: -1
+
+ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
+  blank_index: !ref <blank_index>