diff --git a/.gitignore b/.gitignore
index 68b790b687ef52b8fe7e9397f5988832fbf89987..410ef0b679b8b66ab288b04cb946e0dd6c2a52c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,4 @@ cover/
 checkpoints/
 imagenet_val/
 *.json
+data/
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/README.md b/models/nlp/language_model/bert_base_squad/ixrt/README.md
index 6d0858ac6f92edab227711c5526ce47d2b5e250c..6a9737cff443470d1176b6eb41aaecb052d30d7f 100644
--- a/models/nlp/language_model/bert_base_squad/ixrt/README.md
+++ b/models/nlp/language_model/bert_base_squad/ixrt/README.md
@@ -29,13 +29,36 @@ bash script/prepare.sh v1_1
 ## Inference
 
-### FP16
+### On T4
+
 ```bash
+# FP16
 cd python
 pip install onnx pycuda
 # use --bs to set max_batch_size (dynamic)
-bash script/build_engine --bs 32
-bash script/inference_squad.sh --bs {batch_size}
+bash script/build_engine.sh --bs 32
+bash script/inference_squad.sh --bs 32
+```
+
+```bash
+# INT8
+cd python
+pip install onnx pycuda
+bash script/build_engine.sh --bs 32 --int8
+bash script/inference_squad.sh --bs 32 --int8
+```
+### On Iluvatar
+
+```bash
+# FP16
+cd python/script
+bash infer_bert_base_squad_fp16_ixrt.sh
+```
+
+```bash
+# INT8
+cd python/script
+bash infer_bert_base_squad_int8_ixrt.sh
 ```
 
 ## Results
@@ -44,3 +67,5 @@ Model | BatchSize | Precision | FPS | ACC
 ------|-----------|-----------|-----|----
 BERT-Base-SQuAD | 32 | fp16 | Latency QPS: 1543.40 sentences/s | "exact_match": 80.92, "f1": 88.20
+## Reference
+- [bert-base-uncased.zip (external link)](https://drive.google.com/file/d/1_DJDdKBanqJ6h3VGhH78F9EPgE2wK_Tw/view?usp=drive_link)
\ No newline at end of file
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/cmake/FindCuda.cmake b/models/nlp/language_model/bert_base_squad/ixrt/cmake/FindCuda.cmake
index 58e39e6003cb6a0545a76f9a6fab88e44fe39caa..e8aa67dc2dc3a2a03af152038dcd54f80c0497e8 100644
--- a/models/nlp/language_model/bert_base_squad/ixrt/cmake/FindCuda.cmake
+++ b/models/nlp/language_model/bert_base_squad/ixrt/cmake/FindCuda.cmake
@@ -11,7 +11,7 @@ if(DEFINED ENV{CUDA_PATH})
   set(CUDA_PATH "$ENV{CUDA_PATH}")
 else()
   set(CUDA_PATH
-      "/opt/sw_home/local/cuda"
+      "/usr/local/corex"
       CACHE PATH "cuda installation root path")
 endif()
 message(STATUS "Use CUDA_PATH=${CUDA_PATH} ")
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8632d95dec10d22834cf928ef8f8c940c1c12962
--- /dev/null
+++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder.py
@@ -0,0 +1,394 @@
+#!/usr/bin/env python3
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import json +import tensorrt as trt +import time +import sys +import ctypes +import os +import numpy as np +from builder_utils import load_onnx_weights_and_quant, load_pytorch_weights_and_quant +from builder_utils import WQKV, BQKV # Attention Keys +from builder_utils import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys +from builder_utils import SQD_W, SQD_B # SQuAD Output Keys + +trt_version = [int(n) for n in trt.__version__.split('.')] +plugin_lib_name = "libnvinfer_plugin.so" if os.getenv('USE_TRT') == 'True' else "libixrt_plugin.so" +print(plugin_lib_name) + +TRT_LOGGER = trt.Logger(trt.Logger.WARNING) +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin(TRT_LOGGER) + +plg_registry = trt.get_plugin_registry() +registry_list = plg_registry.plugin_creator_list +print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list]) +emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "1", "") +qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "1", "") +skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "1", "") +ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "") +gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "") +fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "") + +class BertConfig: + def __init__(self, bert_config_path, use_fp16, use_trt): + with open(bert_config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.use_fp16 = use_fp16 + self.use_trt = use_trt + +def set_tensor_name(tensor, prefix, name): + tensor.name = prefix + name + +def set_output_name(layer, prefix, name, out_idx = 0): + set_tensor_name(layer.get_output(out_idx), prefix, name) + +def set_output_range(layer, maxval, out_idx = 0): + layer.get_output(out_idx).set_dynamic_range(-maxval, maxval) + +def get_mha_dtype(config): + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + return int(dtype) + +def custom_fc(network, input_tensor, out_dims, W, B): + pf_out_dims = trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32) + pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + fields = [pf_out_dims, pf_type, pf_W] + if B is not None: + pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32) + fields.append(pf_B) + + pfc = trt.PluginFieldCollection(fields) + fc_plugin = 
fc_plg_creator.create_plugin("fcplugin", pfc) + plug_inputs = [input_tensor] + out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) + return out_dense + +def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask): + """ + Add the attention layer + """ + B, S, hidden_size = input_tensor.shape + num_heads = config.num_attention_heads + head_size = int(hidden_size / num_heads) + + Wall = init_dict[prefix + WQKV] + Ball = init_dict[prefix + BQKV] + + # FC_attention + mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball) + + has_mask = imask is not None + # QKV2CTX + pf_type = trt.PluginField("type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) + pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) + pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32) + pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask, pf_type]) + qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) + + qkv_in = [mult_all.get_output(0)] + if has_mask: + qkv_in.append(imask) + qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug) + return qkv2ctx + + +def skipln(prefix, config, init_dict, network, input_tensor, skip, bias=None): + """ + Add the skip layer + """ + idims = input_tensor.shape + hidden_size = idims[2] + + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + + pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + wbeta = init_dict[prefix + "beta"] + pf_beta = trt.PluginField("beta", wbeta, trt.PluginFieldType.FLOAT32) + wgamma = init_dict[prefix + "gamma"] + pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32) + pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + + fields = [pf_ld, pf_beta, pf_gamma, pf_type ] + + if bias is not None: + pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + fields.append(pf_bias) + + pfc = trt.PluginFieldCollection(fields) + skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) + + skipln_inputs = [input_tensor, skip] + layer = network.add_plugin_v2(skipln_inputs, skipln_plug) + return layer + +def ffn_trt(prefix, config, init_dict, network, input_tensor): + # FC1 + GELU + B_mid = init_dict[prefix + B_MID] + W_mid = init_dict[prefix + W_MID] + mid_dense = network.add_fully_connected(input_tensor, config.intermediate_size, W_mid, B_mid) + + dtype = trt.float32 + if config.use_fp16: + dtype = trt.float16 + pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + pf_ld = trt.PluginField("ld", np.array([config.hidden_size], np.int32), trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([pf_type, pf_ld]) + gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc) + + gelu_inputs = [mid_dense.get_output(0)] + gelu_layer = network.add_plugin_v2(gelu_inputs, gelu_plug) + + intermediate_act = gelu_layer.get_output(0) + + # FC2 + # Dense to hidden size + B_lout = init_dict[prefix + B_LOUT] + W_lout = init_dict[prefix + W_LOUT] + out_dense = network.add_fully_connected(intermediate_act, config.hidden_size, W_lout, B_lout) + B_lout = None + + out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, B_lout) + return out_layer + 
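+# Note: ffn_trt() above builds FC1 -> GELU -> FC2 from add_fully_connected layers plus
+# the standalone GELU plugin, while ffn() below hands W1/W2 and the FC1 bias to the
+# fused CustomFFNPluginDynamic_IxRT plugin and leaves the FC2 bias to the following
+# skip-layernorm.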
+def ffn(prefix, config, init_dict, network, input_tensor): + # FC1 + GELU + B_mid = init_dict[prefix + B_MID] + W_mid = init_dict[prefix + W_MID] + B_lout = init_dict[prefix + B_LOUT] + W_lout = init_dict[prefix + W_LOUT] + pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32) + pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32) + pf_W1 = trt.PluginField("W1", W_mid, trt.PluginFieldType.FLOAT32) + pf_W2 = trt.PluginField("W2", W_lout, trt.PluginFieldType.FLOAT32) + pf_B1 = trt.PluginField("B1", B_mid, trt.PluginFieldType.FLOAT32) + pf_act_type = trt.PluginField("act_type", np.array(int(3), np.int32), trt.PluginFieldType.INT32) + pfc = trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]) + ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc) + + ffn_inputs = [input_tensor] + ffn_layer = network.add_plugin_v2(ffn_inputs, ffn_plug) + + out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, ffn_layer.get_output(0), input_tensor, B_lout) + return out_layer + +def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask): + """ + Add the transformer layer + """ + idims = input_tensor.shape + hidden_size = idims[2] + + context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) + attention_heads = context_transposed.get_output(0) + + # FC0 + B_aout = init_dict[prefix + B_AOUT] + W_aout = init_dict[prefix + W_AOUT] + attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout) + B_aout = None + + skiplayer = skipln(prefix + "attention_output_layernorm_",config, init_dict, network, attention_out_fc.get_output(0), input_tensor, B_aout) + attention_ln = skiplayer.get_output(0) + + if config.use_trt: + ffn_layer = ffn_trt(prefix, config, init_dict, network, attention_ln) + else: + ffn_layer = ffn(prefix, config, init_dict, network, attention_ln) + return ffn_layer + +def bert_model(config, init_dict, network, input_tensor, input_mask): + """ + Create the bert model + """ + prev_input = input_tensor + for layer in range(0, config.num_hidden_layers): + ss = "l{}_".format(layer) + out_layer = transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask) + prev_input = out_layer.get_output(0) + return prev_input + +def squad_output(prefix, config, init_dict, network, input_tensor): + """ + Create the squad output + """ + + idims = input_tensor.shape + B, S, hidden_size = idims + + W_out = init_dict[prefix + SQD_W] + B_out = init_dict[prefix + SQD_B] + + dense = custom_fc(network, input_tensor, 2, W_out, B_out) + + if config.use_trt: + OUT = network.add_shuffle(dense.get_output(0)) + OUT.second_transpose = (1, 0, 2) + return OUT + return dense + +def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): + input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + + if len(sequence_lengths) > 1: + profile = 
builder.create_optimization_profile() + min_shape = (batch_sizes[0], sequence_lengths[0]) + opt_shape = (batch_sizes[1], sequence_lengths[1]) + max_shape = (batch_sizes[2], sequence_lengths[2]) + assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2]) + + print('set dynamic shape -> ', min_shape, opt_shape, max_shape) + profile.set_shape("input_ids", min_shape, opt_shape, max_shape) + profile.set_shape("segment_ids", min_shape, opt_shape, max_shape) + profile.set_shape("input_mask", min_shape, opt_shape, max_shape) + builder_config.add_optimization_profile(profile) + + wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32) + wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32) + wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32) + wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) + wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) + + output_fp16 = trt.PluginField("output_fp16", np.array([1 if config.use_fp16 else 0]).astype(np.int32), trt.PluginFieldType.INT32) + mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) + fn = emln_plg_creator.create_plugin("embeddings", pfc) + + if config.use_trt: + input_ids = network.add_shuffle(input_ids) + input_ids.second_transpose = (1, 0) + segment_ids = network.add_shuffle(segment_ids) + segment_ids.second_transpose = (1, 0) + input_mask = network.add_shuffle(input_mask) + input_mask.second_transpose = (1, 0) + inputs = [input_ids.get_output(0), segment_ids.get_output(0), input_mask.get_output(0)] + else: + inputs = [input_ids, segment_ids, input_mask] + emb_layer = network.add_plugin_v2(inputs, fn) + return emb_layer + +def build_engine(batch_sizes, sequence_lengths, config, weights_dict): + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + builder = trt.Builder(TRT_LOGGER) + with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: + if config.use_fp16: + builder_config.set_flag(trt.BuilderFlag.FP16) + + # Create the network + emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes) + embeddings = emb_layer.get_output(0) + mask_idx = emb_layer.get_output(1) + + bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx) + + squad_logits = squad_output("cls_", config, weights_dict, network, bert_out) + squad_logits_out = squad_logits.get_output(0) + + network.mark_output(squad_logits_out) + + build_start_time = time.time() + plan = builder.build_serialized_network(network, builder_config) + build_time_elapsed = (time.time() - build_start_time) + TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) + return plan + +def str2bool(v): + return v.lower() in ('yes', 'true') + +def main(): + parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + 
parser.add_argument("-z", "--use_trt", type=str2bool, default=False, help = "Whether to use tensorRT or IxRT") + parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.") + parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.") + parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine") + parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int) + parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int) + parser.add_argument("-c", "--config-dir", required=True, + help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google") + parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False) + parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False) + parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False) + parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False) + + args, _ = parser.parse_known_args() + args.batch_size = args.batch_size or [1] + args.sequence_length = args.sequence_length or [128] + + if len(args.sequence_length) not in [1, 3]: + print("Error: You must provide either one or three integers.") + sys.exit(1) + + if len(args.batch_size) not in [1, 3]: + print("Error: You must provide either one or three integers.") + sys.exit(1) + + if args.verbose: + TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE + + bert_config_path = args.config_dir + TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) + + config = BertConfig(bert_config_path, args.fp16, args.use_trt) + + if args.onnx != None: + weights_dict = load_onnx_weights_and_quant(args.onnx, config) + elif args.pytorch != None: + weights_dict = load_pytorch_weights_and_quant(args.pytorch, config) + else: + raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") + + with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine: + TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) + with open(args.output, "wb") as fout: + fout.write(serialized_engine) + TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") + +if __name__ == "__main__": + main() diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_int8.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_int8.py new file mode 100644 index 0000000000000000000000000000000000000000..7167882bff938a2020dfd896cacfd43572e6d5be --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_int8.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
+# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import argparse +import json +import tensorrt as trt +import time +import sys +import ctypes +import os +import numpy as np +from builder_utils_int8 import load_pytorch_weights_and_quant +from builder_utils_int8 import WQKV, BQKV # Attention Keys +from builder_utils_int8 import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys +from builder_utils_int8 import SQD_W, SQD_B # SQuAD Output Keys +from builder import custom_fc as custom_fc_fp16 + +trt_version = [int(n) for n in trt.__version__.split('.')] + +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +from load_ixrt_plugin import load_ixrt_plugin +load_ixrt_plugin(TRT_LOGGER) + +plg_registry = trt.get_plugin_registry() +registry_list = plg_registry.plugin_creator_list +print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list]) +emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "2", "") +qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "3", "") +skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "3", "") +gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "") +fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "2", "") + +# +class BertConfig: + def __init__(self, bert_config_path, use_int8): + with open(bert_config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.use_int8 = use_int8 + +def set_tensor_name(tensor, prefix, name): + tensor.name = prefix + name + +def set_output_name(layer, prefix, name, out_idx = 0): + set_tensor_name(layer.get_output(out_idx), prefix, name) + +def set_output_range(layer, maxval, out_idx = 0): + layer.get_output(out_idx).set_dynamic_range(-maxval, maxval) + +def get_mha_dtype(config): + dtype = trt.float32 + if config.use_int8: + dtype = trt.int8 + return int(dtype) + +def custom_fc(prefix, config, init_dict, network, input_tensor, out_dims, W, B): 
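+    """
+    Add a CustomFCPluginDynamic_IxRT fully-connected layer. For INT8 the QAT clip
+    values (prefix + "wei_amax" and, when a bias is supplied, prefix + "out_amax")
+    are packed into the "fc_amax" plugin field.
+    """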
+ pf_out_dims = trt.PluginField("out_dims", np.array([out_dims], dtype=np.int32), trt.PluginFieldType.INT32) + pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + + fields = [pf_out_dims, pf_W] + + if config.use_int8: + amax_vec = [init_dict[prefix + "wei_amax"]] + if B is not None: + pf_B = trt.PluginField("Bias", B, trt.PluginFieldType.FLOAT32) + amax_vec.append(init_dict[prefix + "out_amax"]) + pf_amax = trt.PluginField("fc_amax", np.array(amax_vec, np.float32), trt.PluginFieldType.FLOAT32) + fields.append(pf_B) + fields.append(pf_amax) + else: + pf_amax = trt.PluginField("fc_amax", np.array(amax_vec, np.float32), trt.PluginFieldType.FLOAT32) + fields.append(pf_amax) + + pfc = trt.PluginFieldCollection(fields) + fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) + plug_inputs = [input_tensor] + out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) + return out_dense + +def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask): + """ + Add the attention layer + """ + B, S, hidden_size = input_tensor.shape + num_heads = config.num_attention_heads + head_size = int(hidden_size / num_heads) + + Wall = init_dict[prefix + WQKV] + Ball = init_dict[prefix + BQKV] + + # FC_attention + mult_all = custom_fc(prefix + "self_qkv_", config, init_dict, network, input_tensor, 3*hidden_size, Wall, Ball) + set_output_range(mult_all, init_dict[prefix + "self_qkv_out_amax"]) + + has_mask = imask is not None + + # QKV2CTX + pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) + fields = [pf_hidden_size, pf_num_heads] + dq_probs = [ + init_dict[prefix + "arrange_qkv_amax"], + init_dict[prefix + "softmax_in_amax"], + init_dict[prefix + "softmax_out_amax"] + ] + pf_dq = trt.PluginField("dq_probs", np.array(dq_probs, np.float32), trt.PluginFieldType.FLOAT32) + fields.append(pf_dq) + + pfc = trt.PluginFieldCollection(fields) + qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) + + qkv_in = [mult_all.get_output(0)] + if has_mask: + qkv_in.append(imask) + qkv2ctx = network.add_plugin_v2(qkv_in, qkv2ctx_plug) + if config.use_int8: + set_output_range(qkv2ctx, init_dict[prefix + "output_dense_in_amax"]) + return qkv2ctx + + +def skipln(prefix, config, init_dict, network, input_tensor, skip, residual, is_last_layer, bias=None): + """ + Add the skip layer + """ + idims = input_tensor.shape + hidden_size = idims[2] + + dtype = trt.float32 + if config.use_int8: + dtype = trt.int8 + + wbeta = init_dict[prefix + "beta"] + wgamma = init_dict[prefix + "gamma"] + + pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + pf_beta = trt.PluginField("beta", wbeta, trt.PluginFieldType.FLOAT32) + pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32) + pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + + fields = [pf_ld, pf_beta, pf_gamma, pf_type ] + if bias is not None: + pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + fields.append(pf_bias) + if is_last_layer: + pf_fp32 = trt.PluginField("output_fp32", np.array([1], np.int32), trt.PluginFieldType.INT32) + fields.append(pf_fp32) + + pfc = trt.PluginFieldCollection(fields) + skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) + + skipln_inputs = [input_tensor, skip] + if config.use_int8: + skipln_inputs.append(residual) + layer = 
network.add_plugin_v2(skipln_inputs, skipln_plug) + return layer + +def ffn(prefix, config, init_dict, network, input_tensor, residual, is_last_layer): + # FC1 + GELU + B_mid = init_dict[prefix + B_MID] + W_mid = init_dict[prefix + W_MID] + + mid_dense = custom_fc(prefix + "intermediate_dense_", config, init_dict, network, input_tensor, config.intermediate_size, W_mid, None) + set_output_range(mid_dense, init_dict[prefix + "intermediate_dense_out_amax"]) + + dtype = trt.float32 + + if config.use_int8: + dtype = trt.int8 + + pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + pf_ld = trt.PluginField("ld", np.array([int(config.intermediate_size)], np.int32), trt.PluginFieldType.INT32) + fields = [pf_type, pf_ld] + if config.use_int8: + pf_bias = trt.PluginField("bias", B_mid, trt.PluginFieldType.FLOAT32) + fields.append(pf_bias) + + pfc = trt.PluginFieldCollection(fields) + gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc) + + gelu_inputs = [mid_dense.get_output(0)] + gelu_layer = network.add_plugin_v2(gelu_inputs, gelu_plug) + + if config.use_int8: + set_output_range(gelu_layer, init_dict[prefix + "output_dense_in_amax"]) + + intermediate_act = gelu_layer.get_output(0) + # set_tensor_name(intermediate_act, prefix, "gelu") + + # FC2 + # Dense to hidden size + B_lout = init_dict[prefix + B_LOUT] + W_lout = init_dict[prefix + W_LOUT] + out_dense = custom_fc(prefix + "output_dense_", config, init_dict, network, intermediate_act, config.hidden_size, W_lout, None) + set_output_range(out_dense, init_dict[prefix + "output_dense_out_amax"]) + + out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, residual, is_last_layer, B_lout) + return out_layer + +def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask, residual, is_last_layer): + """ + Add the transformer layer + """ + idims = input_tensor.shape + hidden_size = idims[2] + + context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) + attention_heads = context_transposed.get_output(0) + + # FC0 + B_aout = init_dict[prefix + B_AOUT] + W_aout = init_dict[prefix + W_AOUT] + attention_out_fc = custom_fc(prefix + "attention_output_dense_", config, init_dict, network, attention_heads, hidden_size, W_aout, None) + set_output_range(attention_out_fc, init_dict[prefix + "attention_output_dense_out_amax"]) + + skiplayer = skipln(prefix + "attention_output_layernorm_", config, init_dict, network, attention_out_fc.get_output(0), input_tensor, residual, False, B_aout) + if config.use_int8: + set_output_range(skiplayer, init_dict[prefix + "intermediate_dense_in_amax"]) + + ffn_layer = ffn(prefix, config, init_dict, network, skiplayer.get_output(0), skiplayer.get_output(1), is_last_layer) + return ffn_layer + +def bert_model(config, init_dict, network, input_tensor, input_mask, residual): + """ + Create the bert model + """ + prev_input = input_tensor + for layer in range(0, config.num_hidden_layers): + ss = "l{}_".format(layer) + out_layer = transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask, residual, + True if config.use_int8 and layer == config.num_hidden_layers - 1 else False) + prev_input = out_layer.get_output(0) + residual = None + if config.use_int8: + residual = out_layer.get_output(1) + if layer < config.num_hidden_layers - 1: + set_output_range(out_layer, init_dict["l{}_".format(layer+1) + "attention_self_qkv_in_amax"]) + else: + 
set_output_range(out_layer, 1) + + return prev_input + +def squad_output(prefix, config, init_dict, network, input_tensor): + """ + Create the squad output + """ + + idims = input_tensor.shape + B, S, hidden_size = idims + + W_out = init_dict[prefix + SQD_W] + B_out = init_dict[prefix + SQD_B] + + dense = custom_fc_fp16(network, input_tensor, 2, W_out, B_out) + return dense + +def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): + input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + + if len(sequence_lengths) > 1: + profile = builder.create_optimization_profile() + min_shape = (batch_sizes[0], sequence_lengths[0]) + opt_shape = (batch_sizes[1], sequence_lengths[1]) + max_shape = (batch_sizes[2], sequence_lengths[2]) + assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2]) + + print('set dynamic shape -> ', min_shape, opt_shape, max_shape) + profile.set_shape("input_ids", min_shape, opt_shape, max_shape) + profile.set_shape("segment_ids", min_shape, opt_shape, max_shape) + profile.set_shape("input_mask", min_shape, opt_shape, max_shape) + builder_config.add_optimization_profile(profile) + + wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32) + wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32) + wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32) + wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) + wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) + + output_fp16 = trt.PluginField("output_fp16", np.array([1]).astype(np.int32), trt.PluginFieldType.INT32) + mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) + + pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) + fn = emln_plg_creator.create_plugin("embeddings", pfc) + + inputs = [input_ids, segment_ids, input_mask] + emb_layer = network.add_plugin_v2(inputs, fn) + + if config.use_int8: + set_output_range(emb_layer, weights_dict["l0_attention_self_qkv_in_amax"]) + set_output_range(emb_layer, 1.0, 1) + return emb_layer + +def build_engine(batch_sizes, sequence_lengths, config, weights_dict): + explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + + builder = trt.Builder(TRT_LOGGER) + with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: + network = builder.create_network(explicit_batch_flag) + builder_config = builder.create_builder_config() + builder_config.set_flag(trt.BuilderFlag.INT8) + + # Create the network + 
emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes) + embeddings = emb_layer.get_output(0) + mask_idx = emb_layer.get_output(1) + + residual_buffer = None + if config.use_int8: + residual_buffer = emb_layer.get_output(2) + + bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx, residual_buffer) + + squad_logits = squad_output("cls_", config, weights_dict, network, bert_out) + squad_logits_out = squad_logits.get_output(0) + + network.mark_output(squad_logits_out) + + build_start_time = time.time() + plan = builder.build_serialized_network(network, builder_config) + build_time_elapsed = (time.time() - build_start_time) + TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) + return plan + +def main(): + parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.") + parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.") + parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine") + parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int) + parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int) + parser.add_argument("-c", "--config-dir", required=True, + help="The folder containing the bert_config.json, which can be downloaded e.g. 
from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google") + parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False) + parser.add_argument("-i", "--int8", action="store_true", help="Indicates that inference should be run in INT8 precision", required=False) + parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False) + parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False) + parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False) + + args, _ = parser.parse_known_args() + args.batch_size = args.batch_size or [1] + args.sequence_length = args.sequence_length or [128] + + if len(args.sequence_length) not in [1, 3]: + print("Error: You must provide either one or three integers.") + sys.exit(1) + + if len(args.batch_size) not in [1, 3]: + print("Error: You must provide either one or three integers.") + sys.exit(1) + + if args.verbose: + TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE + + bert_config_path = os.path.join(args.config_dir, "config.json") + TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) + + config = BertConfig(bert_config_path, args.int8) + + if args.onnx != None: + if args.int8: + raise RuntimeError("int8 onnx not supported now!!!") + elif args.pytorch != None: + weights_dict = load_pytorch_weights_and_quant(args.pytorch, config) + else: + raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") + + # engine = build_engine(args.batch_size, args.workspace_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, None, args.calib_num, args.verbose) + with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine: + TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) + with open(args.output, "wb") as fout: + fout.write(serialized_engine) + TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") + +if __name__ == "__main__": + main() diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..25018bd1c9f2da211a650f16b335613abb04a4eb --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils.py @@ -0,0 +1,269 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import onnx +import numpy as np +import tensorrt as trt +import json +import struct +import torch + +TRT_LOGGER = trt.Logger(trt.Logger.INFO) + +""" +Attentions Keys +""" +WQ = "self_query_kernel" +BQ = "self_query_bias" +WK = "self_key_kernel" +BK = "self_key_bias" +WV = "self_value_kernel" +BV = "self_value_bias" +WQKV = "self_qkv_kernel" +BQKV = "self_qkv_bias" + +""" +Transformer Keys +""" +W_AOUT = "attention_output_dense_kernel" +B_AOUT = "attention_output_dense_bias" +AOUT_LN_BETA = "attention_output_layernorm_beta" +AOUT_LN_GAMMA = "attention_output_layernorm_gamma" +W_MID = "intermediate_dense_kernel" +B_MID = "intermediate_dense_bias" +W_LOUT = "output_dense_kernel" +B_LOUT = "output_dense_bias" +LOUT_LN_BETA = "output_layernorm_beta" +LOUT_LN_GAMMA = "output_layernorm_gamma" + +""" +Squad Output Keys +""" +SQD_W = "squad_output_weights" +SQD_B = "squad_output_bias" + + +def get_onnx_weight_dict(tensor_dict, config): + N = config.num_attention_heads + H = config.head_size + hidden_size = config.hidden_size + + weights_dict = dict() + for outname, tensor in tensor_dict.items(): + if outname.find("_amax") != -1: + weights_dict[outname] = tensor.flatten() + elif outname.find(BQ) != -1: + prefix = outname[:outname.find(BQ)] + + Wqkv = np.zeros((3, hidden_size, hidden_size), np.float32) + Bqkv = np.zeros((3, hidden_size), np.float32) + + Wqkv[0,:,:] = tensor_dict[prefix + WQ] + Wqkv[1,:,:] = tensor_dict[prefix + WK] + Wqkv[2,:,:] = tensor_dict[prefix + WV] + Bqkv[0,:] = tensor + Bqkv[1,:] = tensor_dict[prefix + BK] + Bqkv[2,:] = tensor_dict[prefix + BV] + + if config.use_trt: + Wqkv = np.ascontiguousarray(Wqkv.reshape((3, N, H, N, H)).transpose((1,0,2,3,4))) + Bqkv = np.ascontiguousarray(Bqkv.reshape((3, N, H)).transpose((1,0,2))) + + weights_dict[prefix + WQKV] = Wqkv.flatten() + weights_dict[prefix + BQKV] = Bqkv.flatten() + weights_dict[prefix + WQKV + "_notrans"] = np.ascontiguousarray(Wqkv.T).flatten() + + elif outname.find(BK) != -1 or outname.find(BV) != -1 or outname.find(WQ) != -1 or outname.find(WK) != -1 or outname.find(WV) != -1: + pass + else: + flat_tensor = np.ascontiguousarray(tensor).flatten() + weights_dict[outname] = flat_tensor + + if outname.find("kernel") != -1 and config.use_trt: + tensor = np.transpose(tensor) + weights_dict[outname + "_notrans"] = np.ascontiguousarray(tensor).flatten() + + return weights_dict + +def onnx_to_trt_name(onnx_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + onnx_name = onnx_name.lower() + toks = [t.strip('_') for t in onnx_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + # Token conversions for sparse checkpoints + 
if toks[-2] == 'dense_act': + toks[-2] = 'dense' + elif toks[-3] == 'dense_act': + if toks[-2] == 'input_quantizer': + toks[-2] = 'input' + elif toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + toks[-3] = 'dense' + elif toks[-2].startswith('matmul'): + toks[-2] = { + 'matmul_q_quantizer': 'qv_a_input_quantizer', + 'matmul_k_quantizer': 'qv_b_input_quantizer', + 'matmul_v_quantizer': 'av_b_input_quantizer', + 'matmul_a_quantizer': 'av_a_input_quantizer', + }[toks[-2].replace('input_', '')] + + # Token conversions for all checkpoints + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + elif (toks[-3] == 'dense' or toks[-3] in qkv_strings) and toks[-1] == 'amax': + if toks[-2] == 'weight_quantizer': + toks[-2] = 'kernel' + elif toks[-2] == 'input_quantizer': + toks[-2] = 'input' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + if toks[-1] == 'amax': + toks[-2] = 'amax' + toks = toks[:-1] + elif 'qa' in onnx_name: + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", onnx_name) + assert(False) + parsed = '_'.join(toks) + return parsed + +def pt_to_trt_name(pt_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + pt_name = pt_name.lower() + toks = [t.strip('_') for t in pt_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + toks = toks[:-1] + + elif 'qa_outputs' in pt_name: ## + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", pt_name) + assert(False) + parsed = '_'.join(toks) + return parsed + +def load_onnx_weights_and_quant(path, config): + """ + Load the weights from the onnx checkpoint + """ + model = onnx.load(path) + weights = model.graph.initializer + # for w in weights: + # print(w.name, w.dims,flush=True) + tensor_dict = dict((onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.int8).reshape(w.dims)) + if w.name.split('_')[-1] == 'mask' else + (onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.float32).reshape(w.dims)) + for w in weights) + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) + + return get_onnx_weight_dict(tensor_dict, config) + +def load_pytorch_weights_and_quant(path, config): + """ + Load the weights from the pytorch checkpoint + """ + state_dict = 
torch.load(path, map_location='cpu') + # for name in state_dict: + # print(name, state_dict[name].size(),flush=True) + tensor_dict = {pt_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) + return get_onnx_weight_dict(tensor_dict, config) + +class BertConfig: + def __init__(self, bert_config_path, use_fp16, use_int8=False): + with open(bert_config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.use_fp16 = use_fp16 + self.use_int8 = use_int8 + +if __name__ == '__main__': + bert_config_path = '../bert-large-uncased/bert_config.json' + onnx_model_path = '../bert-large-uncased/bert_large_v1_1_fake_quant.onnx' + weight_save_path = "../bert-large-uncased/bert_large_v1_1.wts" + config = config = BertConfig(bert_config_path, True) + weights_dict = load_onnx_weights_and_quant(onnx_model_path, config) + f = open(weight_save_path, "w") + num = 0 + for key, value in weights_dict.items(): + if key.find('_amax') == -1: + num += 1 + + f.write('{}\n'.format(num)) + for key, value in weights_dict.items(): + print('key: ', key) + if key.find('_amax') != -1: + continue + f.write("{} {}".format(key, len(value))) + print(len(value)) + for v in value: + f.write(" ") + f.write(struct.pack('>f', float(v)).hex()) + f.write("\n") diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py new file mode 100644 index 0000000000000000000000000000000000000000..67a53f05b4fbaba98420924abe3a4d7afdbd01bd --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/builder_utils_int8.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
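+# Weight-loading helpers for the INT8 (QAT) builder: ixrt_name_map and
+# ixrt_atten_name_map translate PyTorch checkpoint tensor names (including the
+# quantizer clip/amax values) into the IxRT naming scheme expected by builder_int8.py.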
+import numpy as np +import tensorrt as trt +import json +import struct +import torch + +TRT_LOGGER = trt.Logger(trt.Logger.INFO) + +""" +Attentions Keys +""" +WQ = "self_query_kernel" +BQ = "self_query_bias" +WK = "self_key_kernel" +BK = "self_key_bias" +WV = "self_value_kernel" +BV = "self_value_bias" +WQKV = "self_qkv_kernel" +BQKV = "self_qkv_bias" + +""" +Transformer Keys +""" +W_AOUT = "attention_output_dense_kernel" +B_AOUT = "attention_output_dense_bias" +AOUT_LN_BETA = "attention_output_layernorm_beta" +AOUT_LN_GAMMA = "attention_output_layernorm_gamma" +W_MID = "intermediate_dense_kernel" +B_MID = "intermediate_dense_bias" +W_LOUT = "output_dense_kernel" +B_LOUT = "output_dense_bias" +LOUT_LN_BETA = "output_layernorm_beta" +LOUT_LN_GAMMA = "output_layernorm_gamma" + +""" +Squad Output Keys +""" +SQD_W = "squad_output_weights" +SQD_B = "squad_output_bias" + +ixrt_name_map = { + "bert.embeddings.LayerNorm.bias": "bert_embeddings_layernorm_beta", + "bert.embeddings.LayerNorm.weight" : "bert_embeddings_layernorm_gamma", + "bert.embeddings.word_embeddings.weight" : "bert_embeddings_word_embeddings", + "bert.embeddings.token_type_embeddings.weight" : "bert_embeddings_token_type_embeddings", + "bert.embeddings.position_embeddings.weight" : "bert_embeddings_position_embeddings", + "qa_outputs.weight" : "cls_squad_output_weights", + "qa_outputs.bias" : "cls_squad_output_bias" +} + +ixrt_atten_name_map = { + "bert.encoder.layer.{}.self_attn.qkv_proj.weight" : "l{}_attention_self_qkv_kernel", + "bert.encoder.layer.{}.self_attn.qkv_proj.bias" : "l{}_attention_self_qkv_bias", + "bert.encoder.layer.{}.self_attn.out_proj.bias" : "l{}_attention_output_dense_bias", + "bert.encoder.layer.{}.self_attn.out_proj.weight" : "l{}_attention_output_dense_kernel", + "bert.encoder.layer.{}.fc1.weight" : "l{}_intermediate_dense_kernel", + "bert.encoder.layer.{}.fc1.bias" : "l{}_intermediate_dense_bias", + "bert.encoder.layer.{}.fc2.weight" : "l{}_output_dense_kernel", + "bert.encoder.layer.{}.fc2.bias" : "l{}_output_dense_bias", + "bert.encoder.layer.{}.self_attn_layer_norm.weight" : "l{}_attention_output_layernorm_gamma", + "bert.encoder.layer.{}.self_attn_layer_norm.bias" : "l{}_attention_output_layernorm_beta", + "bert.encoder.layer.{}.final_layer_norm.weight" : "l{}_output_layernorm_gamma", + "bert.encoder.layer.{}.final_layer_norm.bias" : "l{}_output_layernorm_beta", + "bert.encoder.layer.{}.self_attn.qkv_proj.weight_quant.clip.clip_value_max" : "l{}_attention_self_qkv_wei_amax", + "bert.encoder.layer.{}.self_attn.qkv_proj.input_quant.clip.clip_value_max" : "l{}_attention_self_qkv_in_amax", + "bert.encoder.layer.{}.self_attn.qkv_proj.output_quant.clip.clip_value_max" : "l{}_attention_self_qkv_out_amax", + "bert.encoder.layer.{}.self_attn.attention_quant.clip.clip_value_max" : "l{}_attention_arrange_qkv_amax", + "bert.encoder.layer.{}.self_attn.softmax_in_quant.clip.clip_value_max" : "l{}_attention_softmax_in_amax", + "bert.encoder.layer.{}.self_attn.atten_score_out_quant.clip.clip_value_max" : "l{}_attention_softmax_out_amax", + "bert.encoder.layer.{}.self_attn.out_proj.input_quant.clip.clip_value_max" : "l{}_attention_output_dense_in_amax", + "bert.encoder.layer.{}.self_attn.out_proj.output_quant.clip.clip_value_max" : "l{}_attention_output_dense_out_amax", + "bert.encoder.layer.{}.self_attn.out_proj.weight_quant.clip.clip_value_max" : "l{}_attention_output_dense_wei_amax", + "bert.encoder.layer.{}.fc1.input_quant.clip.clip_value_max" : "l{}_intermediate_dense_in_amax", + 
"bert.encoder.layer.{}.fc1.output_quant.clip.clip_value_max" : "l{}_intermediate_dense_out_amax", + "bert.encoder.layer.{}.fc1.weight_quant.clip.clip_value_max" : "l{}_intermediate_dense_wei_amax", + "bert.encoder.layer.{}.fc2.input_quant.clip.clip_value_max" : "l{}_output_dense_in_amax", + "bert.encoder.layer.{}.fc2_out_quant.clip.clip_value_max" : "l{}_output_dense_out_amax", + "bert.encoder.layer.{}.fc2.weight_quant.clip.clip_value_max" : "l{}_output_dense_wei_amax" +} + +def get_weight_dict(tensor_dict, config): + N = config.num_attention_heads + H = config.head_size + hidden_size = config.hidden_size + + weights_dict = dict() + for outname, tensor in tensor_dict.items(): + if outname.find("_amax") != -1: + weights_dict[outname] = tensor.item() + elif outname.find(BQ) != -1: + prefix = outname[:outname.find(BQ)] + + Wqkv = np.zeros((3, hidden_size, hidden_size), np.float32) + Bqkv = np.zeros((3, hidden_size), np.float32) + + Wqkv[0,:,:] = tensor_dict[prefix + WQ] + Wqkv[1,:,:] = tensor_dict[prefix + WK] + Wqkv[2,:,:] = tensor_dict[prefix + WV] + Bqkv[0,:] = tensor + Bqkv[1,:] = tensor_dict[prefix + BK] + Bqkv[2,:] = tensor_dict[prefix + BV] + + weights_dict[prefix + WQKV] = Wqkv.flatten() + weights_dict[prefix + BQKV] = Bqkv.flatten() + elif outname.find(BK) != -1 or outname.find(BV) != -1 or outname.find(WQ) != -1 or outname.find(WK) != -1 or outname.find(WV) != -1: + pass + else: + flat_tensor = np.ascontiguousarray(tensor).flatten() + weights_dict[outname] = flat_tensor + + return weights_dict + +def pytorch_to_trt_name(state_dict, num_layer): + tensor_dict = {} + for name in ixrt_name_map.keys(): + tensor_dict[ixrt_name_map[name]] = state_dict[name] + + for name in ixrt_atten_name_map.keys(): + for layer_id in range(num_layer): + key_name = name.format(layer_id) + value_name = ixrt_atten_name_map[name].format(layer_id) + tensor_dict[value_name] = state_dict[key_name] + return tensor_dict + +def load_pytorch_weights_and_quant(path, config): + """ + Load the weights from the pytorch checkpoint + """ + state_dict = torch.load(path, map_location='cpu') + tensor_dict = pytorch_to_trt_name(state_dict, config.num_hidden_layers) + return get_weight_dict(tensor_dict, config) + +class BertConfig: + def __init__(self, bert_config_path, use_fp16, use_int8=False, use_trt=False): + with open(bert_config_path, "r") as f: + data = json.load(f) + self.num_attention_heads = data["num_attention_heads"] + self.hidden_size = data["hidden_size"] + self.intermediate_size = data["intermediate_size"] + self.num_hidden_layers = data["num_hidden_layers"] + self.head_size = self.hidden_size // self.num_attention_heads + self.use_fp16 = use_fp16 + self.use_int8 = use_int8 + self.use_trt = use_trt + +if __name__ == '__main__': + bert_config_path = './data/bert-large-uncased/bert_config.json' + pytorch_model_path = './data/bert-large-uncased/bert_large_int8_qat.bin' + weight_save_path = "./data/bert-large-uncased/bert_large_v1_1_int8.wts" + config = BertConfig(bert_config_path, True) + weights_dict = load_pytorch_weights_and_quant(pytorch_model_path, config) + f = open(weight_save_path, "w") + num = 0 + for key, value in weights_dict.items(): + if key.find('_amax') == -1: + num += 1 + + f.write('{}\n'.format(num)) + for key, value in weights_dict.items(): + if key.find('_amax') != -1: + continue + print('key: ', key) + f.write("{} {}".format(key, len(value))) + print(len(value)) + for v in value: + f.write(" ") + f.write(struct.pack('>f', float(v)).hex()) + f.write("\n") + + 
f.write('{}\n'.format(len(weights_dict) - num)) + for key, value in weights_dict.items(): + if key.find('_amax') == -1: + continue + print('key: ', key) + print('value: ', value) + f.write('{} '.format(key)) + f.write(struct.pack('>f', float(weights_dict[key])).hex()) + f.write('\n') diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py new file mode 100644 index 0000000000000000000000000000000000000000..92c4e83bf7f150156108b7ccd99f0a9373222c2a --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate-v1.1.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Obtained from https://rajpurkar.github.io/SQuAD-explorer/ + +""" Official evaluation script for v1.1 of the SQuAD dataset. """ +from __future__ import print_function +from collections import Counter +import string +import re +import argparse +import json +import sys + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return (normalize_answer(prediction) == normalize_answer(ground_truth)) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + +def evaluate(dataset, predictions, f1_acc): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article['paragraphs']: + for qa in paragraph['qas']: + total += 1 + if qa['id'] not in predictions: + message = 'Unanswered question ' + qa['id'] + \ + ' will receive score 0.' 
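As a toy, self-contained illustration of the token-overlap F1 computed by `f1_score` above (article and punctuation stripping omitted for brevity; the strings are invented):

```python
from collections import Counter

prediction = "Norman conquest"
ground_truth = "Norman conquest of England"
pred_tokens = prediction.lower().split()    # ['norman', 'conquest']
gt_tokens = ground_truth.lower().split()    # ['norman', 'conquest', 'of', 'england']
num_same = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())  # 2 shared tokens
precision = num_same / len(pred_tokens)     # 1.0
recall = num_same / len(gt_tokens)          # 0.5
print(2 * precision * recall / (precision + recall))  # ~0.667
```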
+ print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x['text'], qa['answers'])) + prediction = predictions[qa['id']] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths) + f1 += metric_max_over_ground_truths( + f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + status = 1 + if (f1 < f1_acc - 0.5): + print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.") + status = 0 + else: + print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.") + + return {'exact_match': exact_match, 'f1': f1, "status": status} + +if __name__ == '__main__': + expected_version = '1.1' + parser = argparse.ArgumentParser( + description='Evaluation for SQuAD ' + expected_version) + parser.add_argument('dataset_file', help='Dataset file') + parser.add_argument('prediction_file', help='Prediction File') + parser.add_argument('f1_acc', help='Reference Accuracy') + args = parser.parse_args() + with open(args.dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if (dataset_json['version'] != expected_version): + print('Evaluation expects v-' + expected_version + + ', but got dataset with v-' + dataset_json['version'], + file=sys.stderr) + dataset = dataset_json['data'] + with open(args.prediction_file) as prediction_file: + predictions = json.load(prediction_file) + f1_acc = float(args.f1_acc) + res = evaluate(dataset, predictions, f1_acc) + print(res) + if res["status"] == 1: + print("pass!") + exit() + else: + print("failed!") + exit(1) diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..49b0dedec85518e852bd3d18e106945273094e27 --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/evaluate.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Official evaluation script for v1.1 of the SQuAD dataset. 
""" + +import argparse +import json +import re +import string +import sys +from collections import Counter + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + total += 1 + if qa["id"] not in predictions: + message = ( + "Unanswered question " + qa["id"] + " will receive score 0." + ) + print(message, file=sys.stderr) + continue + ground_truths = list(map(lambda x: x["text"], qa["answers"])) + prediction = predictions[qa["id"]] + exact_match += metric_max_over_ground_truths( + exact_match_score, prediction, ground_truths + ) + f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) + + exact_match = 100.0 * exact_match / total + f1 = 100.0 * f1 / total + + return {"exact_match": exact_match, "f1": f1} + + +if __name__ == "__main__": + expected_version = "1.1" + parser = argparse.ArgumentParser( + description="Evaluation for SQuAD " + expected_version + ) + parser.add_argument("dataset_file", help="Dataset file") + parser.add_argument("prediction_file", help="Prediction File") + args = parser.parse_args() + with open(args.dataset_file) as dataset_file: + dataset_json = json.load(dataset_file) + if dataset_json["version"] != expected_version: + print( + "Evaluation expects v-" + + expected_version + + ", but got dataset with v-" + + dataset_json["version"], + file=sys.stderr, + ) + dataset = dataset_json["data"] + with open(args.prediction_file) as prediction_file: + predictions = json.load(prediction_file) + print(json.dumps(evaluate(dataset, predictions))) diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/__init__.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py new file mode 100644 index 0000000000000000000000000000000000000000..beacc625fae0f73bda3480054e4ecceca85fb240 --- /dev/null +++ 
b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/calibrator.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import tensorrt as trt +import os + +import pycuda.driver as cuda +import pycuda.autoinit +import numpy as np +import helpers.tokenization as tokenization +import helpers.data_processing as dp + +class BertCalibrator(trt.IInt8LegacyCalibrator): + def __init__(self, squad_json, vocab_file, cache_file, batch_size, max_seq_length, num_inputs): + # Whenever you specify a custom constructor for a TensorRT class, + # you MUST call the constructor of the parent explicitly. + trt.IInt8LegacyCalibrator.__init__(self) + + self.cache_file = cache_file + + # Every time get_batch is called, the next batch of size batch_size will be copied to the device and returned. + self.data = dp.read_squad_json(squad_json) + self.max_seq_length = max_seq_length + self.batch_size = batch_size + self.current_index = 0 + self.num_inputs = num_inputs + self.tokenizer = tokenization.BertTokenizer(vocab_file=vocab_file, do_lower_case=True) + self.doc_stride = 128 + self.max_query_length = 64 + + # Allocate enough memory for a whole batch. + self.device_inputs = [cuda.mem_alloc(self.max_seq_length * trt.int32.itemsize * self.batch_size) for binding in range(3)] + + def free(self): + for dinput in self.device_inputs: + dinput.free() + + def get_batch_size(self): + return self.batch_size + + # TensorRT passes along the names of the engine bindings to the get_batch function. + # You don't necessarily have to use them, but they can be useful to understand the order of + # the inputs. The bindings list is expected to have the same ordering as 'names'. 
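For context, a hypothetical sketch of how this calibrator could be attached to a TensorRT INT8 build; the paths, batch size, and input count are placeholders and are not taken from the repo's build scripts:

```python
import tensorrt as trt
from helpers.calibrator import BertCalibrator

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.INT8)
# The calibrator feeds batches of tokenized SQuAD passages during calibration.
config.int8_calibrator = BertCalibrator(
    squad_json="data/squad/dev-v1.1.json",          # placeholder path
    vocab_file="data/bert-base-uncased/vocab.txt",  # placeholder path
    cache_file="bert_calibration.cache",
    batch_size=8,
    max_seq_length=128,
    num_inputs=512,
)
```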
+ def get_batch(self, names): + if self.current_index + self.batch_size > self.num_inputs: + print("Calibrating index {:} batch size {:} exceed max input limit {:} sentences".format(self.current_index, self.batch_size, self.num_inputs)) + return None + + current_batch = int(self.current_index / self.batch_size) + if current_batch % 10 == 0: + print("Calibrating batch {:}, containing {:} sentences".format(current_batch, self.batch_size)) + + input_ids = [] + segment_ids = [] + input_mask = [] + for i in range(self.batch_size): + example = self.data[self.current_index + i] + features = dp.convert_example_to_features(example.doc_tokens, example.question_text, self.tokenizer, self.max_seq_length, self.doc_stride, self.max_query_length) + if len(input_ids) and len(segment_ids) and len(input_mask): + input_ids = np.concatenate((input_ids, features[0].input_ids)) + segment_ids = np.concatenate((segment_ids, features[0].segment_ids)) + input_mask = np.concatenate((input_mask, features[0].input_mask)) + else: + input_ids = features[0].input_ids + segment_ids = features[0].segment_ids + input_mask = features[0].input_mask + + cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel()) + cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel()) + cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel()) + + self.current_index += self.batch_size + return self.device_inputs + + def read_calibration_cache(self): + # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. + if os.path.exists(self.cache_file): + with open(self.cache_file, "rb") as f: + return f.read() + + def write_calibration_cache(self, cache): + with open(self.cache_file, "wb") as f: + f.write(cache) + f.flush() + os.fsync(f) + + def get_quantile(self): + return 0.9999 + + def get_regression_cutoff(self): + return 1.0 + + def read_histogram_cache(self, length): + return None + + def write_histogram_cache(self, ptr, length): + return None diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..712e1a61d29a198eb276f41a9249b0c66e3786ba --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/data_processing.py @@ -0,0 +1,497 @@ +#!/usr/bin/env python3 +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import helpers.tokenization as tokenization +import collections +import numpy as np +import six +import math +import json + + +def convert_doc_tokens(paragraph_text): + + """ Return the list of tokens from the doc text """ + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + doc_tokens = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + + return doc_tokens + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length, + doc_stride, max_query_length): + """Loads a data file into a list of `InputBatch`s.""" + + query_tokens = tokenizer.tokenize(question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. 
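To make the sliding-window layout concrete, a standalone sketch of the span arithmetic implemented just below, using example lengths and the default stride:

```python
# Example: a 900-token document, 20 query tokens, max_seq_length 384, doc_stride 128.
max_tokens_for_doc = 384 - 20 - 3  # room left after the query and [CLS]/[SEP]/[SEP]
doc_len, doc_stride = 900, 128
spans, start_offset = [], 0
while start_offset < doc_len:
    length = min(doc_len - start_offset, max_tokens_for_doc)
    spans.append((start_offset, length))
    if start_offset + length == doc_len:
        break
    start_offset += min(length, doc_stride)
print(spans)  # [(0, 361), (128, 361), (256, 361), (384, 361), (512, 361), (640, 260)]
```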
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + _Feature = collections.namedtuple( # pylint: disable=invalid-name + "Feature", + ["input_ids", "input_mask", "segment_ids", "tokens", "token_to_orig_map", "token_is_max_context"]) + + + features = [] + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + # while len(input_ids) < max_seq_length: + # input_ids.append(0) + # input_mask.append(0) + # segment_ids.append(0) + + # assert len(input_ids) == max_seq_length + # assert len(input_mask) == max_seq_length + # assert len(segment_ids) == max_seq_length + + def create_int_feature(values): + feature = np.asarray(values, dtype=np.int32, order=None) + return feature + + + features.append(_Feature( + input_ids = create_int_feature(input_ids), + input_mask = create_int_feature(input_mask), + segment_ids = create_int_feature(segment_ids), + tokens = tokens, + token_to_orig_map = token_to_orig_map, + token_is_max_context = token_is_max_context + )) + return features + + +def read_squad_json(input_file): + """read from squad json into a list of examples""" + with open(input_file, "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + + _Example = collections.namedtuple( # pylint: disable=invalid-name + "Example", + ["id", "question_text", "doc_tokens"]) + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = convert_doc_tokens(paragraph_text) + + for qa in paragraph["qas"]: + examples.append(_Example( + id = qa["id"], + question_text = qa["question"], + doc_tokens = doc_tokens + )) + + return examples + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece 
tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
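The example from the comment above can be reproduced directly with this helper (run from the `python/ixrt` directory so the `helpers` package is importable):

```python
from helpers.data_processing import get_final_text

# Projects the normalized prediction "steve smith" back onto the raw passage text.
print(get_final_text("steve smith", "Steve Smith's", do_lower_case=True))  # -> Steve Smith
```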
+ tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def get_predictions(doc_tokens, features, results, n_best_size, max_answer_length): + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + prediction = "" + scores_diff_json = 0.0 + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + version_2_with_negative = False + + for result in results: + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + feature = features[result.feature_index] + + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = 0 + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=result.feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=result.feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + + if pred.start_index > 0: # this is a non-null prediction + feature = features[pred.feature_index] + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, True) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + if len(final_text): + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't inlude the empty option in the n-best, inlcude it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
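For intuition, the n-best probabilities computed further below come from `_compute_softmax` over the summed start and end logits; a standalone check with invented logits:

```python
from helpers.data_processing import _compute_softmax

# Summed start+end logits of three candidate spans (invented values).
print(_compute_softmax([7.2, 5.1, 1.0]))  # ~[0.889, 0.109, 0.002], sums to 1
```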
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + null_score_diff_threshold = 0.0 + if not version_2_with_negative: + prediction = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json = score_diff + if score_diff > null_score_diff_threshold: + prediction = "" + else: + prediction = best_non_null_entry.text + + return prediction, nbest_json, scores_diff_json diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py new file mode 100644 index 0000000000000000000000000000000000000000..434f411df061376e565c13b5a96466175b39383c --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/helpers/tokenization.py @@ -0,0 +1,446 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. 
+ + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding='utf-8') as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. 
+ """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
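A quick, standalone check of the CJK handling described above (run from the `python/ixrt` directory; the sample string is invented):

```python
from helpers.tokenization import BasicTokenizer

# Each CJK character becomes its own token; Latin text is lower-cased and split on whitespace.
print(BasicTokenizer(do_lower_case=True).tokenize("BERT處理中文"))  # ['bert', '處', '理', '中', '文']
```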
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
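The need for the explicit ASCII ranges below can be seen from the Unicode categories themselves; a small standalone check:

```python
import unicodedata

# '^', '$' and '`' are classified as symbols (Sk/Sc), not punctuation (P*),
# which is why the ASCII-range test below treats them as punctuation explicitly.
print(unicodedata.category("^"), unicodedata.category("$"), unicodedata.category("`"))  # Sk Sc Sk
```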
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a85e765c91152562d6180307c2bb1317dc385356 --- /dev/null +++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/inference.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import time +import json +import ctypes +import argparse +import collections +import numpy as np +import tensorrt as trt +import pycuda.driver as cuda +import pycuda.autoinit + +import helpers.tokenization as tokenization +import helpers.data_processing as dp +from tqdm import tqdm +import math + +from load_ixrt_plugin import load_ixrt_plugin +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) + +def parse_args(): + """ + Parse command line arguments + """ + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('-e', '--engine', + help='Path to BERT TensorRT engine') + parser.add_argument("-b", "--batch-size", default=1, help="Batch size for inference.", type=int) + parser.add_argument('-p', '--passage', nargs='*', + help='Text for paragraph/passage for BERT QA', + default='') + parser.add_argument('-pf', '--passage-file', + help='File containing input passage', + default='') + parser.add_argument('-q', '--question', nargs='*', + help='Text for query/question for BERT QA', + default='') + parser.add_argument('-qf', '--question-file', + help='File containing input question', + default='') + parser.add_argument('-sq', '--squad-json', + help='SQuAD json file', + default='') + parser.add_argument('-o', '--output-prediction-file', + help='Output prediction file for SQuAD evaluation', + default='./predictions.json') + parser.add_argument('-v', '--vocab-file', + help='Path to file containing entire understandable vocab') + parser.add_argument('-s', '--sequence-length', + help='The sequence length to use. 
Defaults to 128', + default=128, type=int) + parser.add_argument('--max-query-length', + help='The maximum length of a query in number of tokens. Queries longer than this will be truncated', + default=64, type=int) + parser.add_argument('--max-answer-length', + help='The maximum length of an answer that can be generated', + default=30, type=int) + parser.add_argument('--n-best-size', + help='Total number of n-best predictions to generate in the nbest_predictions.json output file', + default=20, type=int) + parser.add_argument('--doc-stride', + help='When splitting up a long document into chunks, what stride to take between chunks', + default=128, type=int) + parser.add_argument('--target_qps', + help="target qps metric", required=False, type=int) + parser.add_argument("-i", "--int8", action="store_true", help="Indicates that inference should be run in INT8 precision", required=False) + args, _ = parser.parse_known_args() + return args + +if __name__ == '__main__': + args = parse_args() + + paragraph_text = None + squad_examples = None + output_prediction_file = None + + if not args.passage == '': + paragraph_text = ' '.join(args.passage) + elif not args.passage_file == '': + f = open(args.passage_file, 'r') + paragraph_text = f.read() + elif not args.squad_json == '': + squad_examples = dp.read_squad_json(args.squad_json) + output_prediction_file = args.output_prediction_file + else: + paragraph_text = input("Paragraph: ") + + question_text = None + if not args.question == '': + question_text = ' '.join(args.question) + elif not args.question_file == '': + f = open(args.question_file, 'r') + question_text = f.read() + + tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True) + # When splitting up a long document into chunks, how much stride to take between chunks. + doc_stride = args.doc_stride + # The maximum total input sequence length after WordPiece tokenization. + # Sequences longer than this will be truncated, and sequences shorter + max_seq_length = args.sequence_length + + def question_features(tokens, question): + # Extract features from the paragraph and question + return dp.convert_example_to_features(tokens, question, tokenizer, max_seq_length, doc_stride, args.max_query_length) + + load_ixrt_plugin(TRT_LOGGER) + + # The first context created will use the 0th profile. A new context must be created + # for each additional profile needed. Here, we only use batch size 1, thus we only need the first profile. + with open(args.engine, 'rb') as f: + runtime = trt.Runtime(TRT_LOGGER) + engine = runtime.deserialize_cuda_engine(f.read()) + context = engine.create_execution_context() + + # select engine profile + selected_profile = -1 + num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles + for idx in range(engine.num_optimization_profiles): + profile_shape = engine.get_profile_shape(profile_index = idx, binding = idx * num_binding_per_profile) + if profile_shape[0][0] <= args.batch_size and profile_shape[2][0] >= args.batch_size and profile_shape[0][1] <= max_seq_length and profile_shape[2][1] >= max_seq_length: + selected_profile = idx + break + if selected_profile == -1: + raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size)) + + # Create a stream in which to copy inputs/outputs and run inference. 
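For reference, the profile scan above reduces to a containment test on the (min, opt, max) shapes that `get_profile_shape` reports; a minimal sketch with made-up shapes:

```python
# get_profile_shape returns (min, opt, max) dims for a binding; a profile is usable
# when the requested batch size and sequence length fall inside the min/max box.
profile_shape = [(1, 1), (32, 128), (32, 384)]  # invented values, not from a real engine
batch_size, seq_len = 16, 128
usable = (profile_shape[0][0] <= batch_size <= profile_shape[2][0]
          and profile_shape[0][1] <= seq_len <= profile_shape[2][1])
print(usable)  # True
```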
+ stream = cuda.Stream() + + # if args.use_trt: + # context.active_optimization_profile = selected_profile + # else: + context.set_optimization_profile_async(selected_profile, stream.handle) + binding_idx_offset = selected_profile * num_binding_per_profile + + input_shape = (args.batch_size, max_seq_length) + input_nbytes = trt.volume(input_shape) * 4 + for binding in range(3): + context.set_binding_shape(binding, input_shape) + assert context.all_binding_shapes_specified + + # Allocate device memory for inputs. + d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] + + # Allocate output buffer by querying the size from the context. This may be different for different input shapes. + h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32) + d_output = cuda.mem_alloc(h_output.nbytes) + + def inference(features, tokens): + global h_output + + _NetworkOutput = collections.namedtuple( # pylint: disable=invalid-name + "NetworkOutput", + ["start_logits", "end_logits", "feature_index"]) + networkOutputs = [] + + eval_time_elapsed = 0 + for feature_index, feature in enumerate(features): + # Copy inputs + input_ids_batch = np.repeat(np.expand_dims(feature.input_ids, 0), args.batch_size, axis=0) + segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0) + input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0) + + input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel())) + segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel())) + input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel())) + + eval_start_time = time.time() + cuda.memcpy_htod_async(d_inputs[0], input_ids, stream) + cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream) + cuda.memcpy_htod_async(d_inputs[2], input_mask, stream) + + # Run inference + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) + # Synchronize the stream + stream.synchronize() + eval_time_elapsed += (time.time() - eval_start_time) + + # Transfer predictions back from GPU + cuda.memcpy_dtoh_async(h_output, d_output, stream) + stream.synchronize() + # for x in h_output[0].reshape(-1,2): + # print(x) + # Only retrieve and post-process the first batch + batch = h_output[0] + + networkOutputs.append(_NetworkOutput( + start_logits = np.array(batch.squeeze()[:, 0]), + end_logits = np.array(batch.squeeze()[:, 1]), + feature_index = feature_index + )) + + eval_time_elapsed /= len(features) + + # Total number of n-best predictions to generate in the nbest_predictions.json output file + n_best_size = 20 + + # The maximum length of an answer that can be generated. 
This is needed + # because the start and end predictions are not conditioned on one another + max_answer_length = 30 + + prediction, nbest_json, scores_diff_json = dp.get_predictions(tokens, features, + networkOutputs, args.n_best_size, args.max_answer_length) + + return eval_time_elapsed, prediction, nbest_json + + def print_single_query(eval_time_elapsed, prediction, nbest_json): + print("------------------------") + print("Running inference in {:.3f} Sentences/Sec".format(args.batch_size/eval_time_elapsed)) + print("------------------------") + + print("Answer: '{}'".format(prediction)) + print("With probability: {:.3f}".format(nbest_json[0]['probability'] * 100.0)) + + def inference_all_dynamic(features_list, squad_examples, sort_index, all_precision): + # h_output = torch.tensor((args.batch_size, max_seq_length, 2)) + global h_output + _NetworkOutput = collections.namedtuple( # pylint: disable=invalid-name + "NetworkOutput", + ["start_logits", "end_logits", "feature_index"]) + networkOutputs = [] + + batch_input_ids = [] + batch_segment_ids = [] + all_token_ids = [] + batch_example_list = [] + batch_feature_list = [] + batch_feature = [] + batch_example = [] + max_batch_length = 0 + seq_length_list = [] + for index in sort_index: + batch_feature.append(features_list[index]) + batch_example.append(squad_examples[index]) + max_batch_length = max(max_batch_length, len(features_list[index].input_ids)) + if args.int8: + max_batch_length = math.ceil(max_batch_length / 2) * 2 + else: + # workround to solve bs=1 10% slow + if args.batch_size == 1: + max_batch_length = math.ceil(max_batch_length / 64) * 64 + seq_length_list.append(len(features_list[index].input_ids)) + if len(batch_feature) == args.batch_size: + batch_input_ids = [ + np.pad(bf.input_ids, (0, max_batch_length - bf.input_ids.shape[0]), 'constant',constant_values = (0)).reshape(1, -1) + for bf in batch_feature + ] + batch_input_ids = np.concatenate(batch_input_ids, axis=0) + batch_segment_ids = [ + np.pad(bf.segment_ids, (0, max_batch_length - bf.segment_ids.shape[0]), 'constant',constant_values = (0)).reshape(1, -1) + for bf in batch_feature + ] + batch_segment_ids = np.concatenate(batch_segment_ids, axis=0) + all_token_ids.append( + [ + batch_input_ids.astype(np.int32), + batch_segment_ids.astype(np.int32) + ] + ) + batch_example_list.append(batch_example) + batch_feature_list.append(batch_feature) + batch_input_ids = [] + batch_segment_ids = [] + batch_feature = [] + batch_example = [] + max_batch_length = 0 + + if len(batch_feature): + batch_input_ids = [ + np.pad(bf.input_ids, (0, max_batch_length - bf.input_ids.shape[0]), 'constant',constant_values = (0)).reshape(1, -1) + for bf in batch_feature + ] + batch_input_ids = np.concatenate(batch_input_ids, axis=0) + batch_segment_ids = [ + np.pad(bf.segment_ids, (0, max_batch_length - bf.segment_ids.shape[0]), 'constant',constant_values = (0)).reshape(1, -1) + for bf in batch_feature + ] + batch_segment_ids = np.concatenate(batch_segment_ids, axis=0) + all_token_ids.append( + [ + batch_input_ids.astype(np.int32), + batch_segment_ids.astype(np.int32) + ] + ) + batch_input_ids = [] + batch_segment_ids = [] + batch_example_list.append(batch_example) + batch_feature_list.append(batch_feature) + + # warm up + for i in range(20): + for binding in range(3): + context.set_binding_shape(binding, (args.batch_size, max_seq_length)) + assert context.all_binding_shapes_specified + cuda.memcpy_htod_async(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream) 
+            cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream)
+            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] + [int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
+            stream.synchronize()
+
+        infer_total_time = 0
+        output_index = 0
+        for input_ids, segment_ids in tqdm(all_token_ids):
+            for binding in range(3):
+                context.set_binding_shape(binding, input_ids.shape)
+            assert context.all_binding_shapes_specified
+
+            cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
+            cuda.memcpy_htod_async(d_inputs[1], segment_ids.ravel(), stream)
+            stream.synchronize()
+
+            infer_start_time = time.time()
+            context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] + [int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)
+            stream.synchronize()
+            infer_end_time = time.time()
+            infer_time = infer_end_time - infer_start_time
+            infer_total_time += infer_time
+
+            cuda.memcpy_dtoh_async(h_output, d_output, stream)
+            stream.synchronize()
+
+            new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2)
+            for index in range(input_ids.shape[0]):
+                networkOutputs.append(_NetworkOutput(
+                    start_logits=new_h_output[index, :seq_length_list[output_index], 0],
+                    end_logits=new_h_output[index, :seq_length_list[output_index], 1],
+                    feature_index=index
+                ))
+                output_index += 1
+
+        output_index = 0
+        for (be, bf) in zip(batch_example_list, batch_feature_list):
+            for index in range(len(bf)):
+                prediction, nbest_json, scores_diff_json = dp.get_predictions(be[index].doc_tokens, bf,
+                    [networkOutputs[output_index]], args.n_best_size, args.max_answer_length)
+                output_index += 1
+                all_precision[be[index].id] = prediction
+        return infer_total_time, all_precision
+
+    status = 0
+    if squad_examples:
+        all_predictions = collections.OrderedDict()
+
+        features_list = []
+        lengths = []
+
+        for example_index, example in enumerate(squad_examples):
+            features = question_features(example.doc_tokens, example.question_text)
+            features_list.append(features[0])
+            lengths.append(len(features[0].input_ids))
+
+        sort_index = np.argsort(lengths)
+        infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions)
+
+        qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time
+        print(f"Latency QPS: {qps} sentences/s")
+
+        with open(output_prediction_file, "w") as f:
+            f.write(json.dumps(all_predictions, indent=4))
+            print("\nOutput dumped to {}".format(output_prediction_file))
+
+        if args.target_qps:
+            if qps >= args.target_qps:
+                print(f"target qps: {args.target_qps}, qps: {qps}, pass.")
+            else:
+                print(f"target qps: {args.target_qps}, qps: {qps}, failed.")
+                status = 1
+    else:
+        # Extract tokens from the paragraph
+        doc_tokens = dp.convert_doc_tokens(paragraph_text)
+
+        if question_text:
+            print("\nPassage: {}".format(paragraph_text))
+            print("\nQuestion: {}".format(question_text))
+
+            features = question_features(doc_tokens, question_text)
+            eval_time_elapsed, prediction, nbest_json = inference(features, doc_tokens)
+            print_single_query(eval_time_elapsed, prediction, nbest_json)
+        else:
+            # If no question text is provided, loop until the question is 'exit'
+            EXIT_CMDS = ["exit", "quit"]
+            question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))
+
+            while question_text.strip() not in EXIT_CMDS:
+                features = question_features(doc_tokens, question_text)
+                eval_time_elapsed, prediction, nbest_json = inference(features, doc_tokens)
+                print_single_query(eval_time_elapsed, prediction, nbest_json)
+                question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS))
+    del context
+    del engine
+    sys.exit(status)
\ No newline at end of file
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2939c651317f9e6b086242e49b251ff7ba56c4
--- /dev/null
+++ b/models/nlp/language_model/bert_base_squad/ixrt/python/ixrt/load_ixrt_plugin.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from os.path import join, dirname, exists
+import tensorrt as trt
+import ctypes
+
+current_directory = os.getcwd()
+
+def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""):
+    if not dynamic_path:
+        dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so")
+    if not exists(dynamic_path):
+        raise FileNotFoundError(
+            f"The ixrt_plugin lib {dynamic_path} does not exist. Please provide a valid plugin path!")
+    ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL)
+    trt.init_libnvinfer_plugins(logger, namespace)
+    print(f"Loaded plugin from {dynamic_path}")
\ No newline at end of file
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh b/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4da5ac8fa51e5abdc9aa600c64a87cb0a125be5f
--- /dev/null
+++ b/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_fp16_ixrt.sh
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -eo pipefail
+
+BSZ=32
+TGT=87
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+        --bs) BSZ=${arguments[index]};;
+        --tgt) TGT=${arguments[index]};;
+        --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=$(realpath ${current_path}/..)
+checkpoints_path=${project_path}/data/bert_base_uncased_squad
+datasets_path=${project_path}/data/
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine FP16 (bert base squad)!"
+cd ${project_path}/ixrt
+python3 builder.py -x ${checkpoints_path}/bert_base_squad.onnx \
+    -w 4096 \
+    -o ${checkpoints_path}/bert_base_b${BSZ}.engine \
+    -s 1 384 384 \
+    -b 1 ${BSZ} ${BSZ} \
+    --fp16 \
+    -c ${checkpoints_path}/config.json \
+    -z ${USE_TRT}
+
+echo "Step2 Run dev.json and generate json"
+python3 inference.py -e ${checkpoints_path}/bert_base_b${BSZ}.engine \
+    -s 384 \
+    -b ${BSZ} \
+    -sq ${datasets_path}/squad/dev-v1.1.json \
+    -v ${checkpoints_path}/vocab.txt \
+    -o ${checkpoints_path}/predictions-bert_base_b${BSZ}.json \
+    -z ${USE_TRT}
+
+echo "Step3 Inference (test F1-score)"
+python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_base_b${BSZ}.json ${TGT}
\ No newline at end of file
diff --git a/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh b/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b5596c1cf6e840b33bcee30e91a61bb22dec2c44
--- /dev/null
+++ b/models/nlp/language_model/bert_base_squad/ixrt/python/script/infer_bert_base_squad_int8_ixrt.sh
@@ -0,0 +1,64 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+set -eo pipefail
+
+BSZ=32
+TGT=86
+USE_TRT=False
+
+# Update arguments
+index=0
+options=$@
+arguments=($options)
+for argument in $options
+do
+    index=`expr $index + 1`
+    case $argument in
+        --bs) BSZ=${arguments[index]};;
+        --tgt) TGT=${arguments[index]};;
+        --use_trt) USE_TRT=${arguments[index]};;
+    esac
+done
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+project_path=$(realpath ${current_path}/..)
+echo ${project_path}
+checkpoints_path=${project_path}/data/bert_base_uncased_squad/
+datasets_path=${project_path}/data/
+
+echo 'USE_TRT='${USE_TRT}
+export USE_TRT=$USE_TRT
+
+echo "Step1 Build Engine Int8 (bert base squad)!"
+cd ${project_path}/ixrt
+python3 builder_int8.py -pt ${checkpoints_path}/bert_base_int8_qat.bin \
+    -o ${checkpoints_path}/bert_base_int8_b${BSZ}.engine \
+    -b 1 ${BSZ} ${BSZ} \
+    -s 1 384 384 \
+    -i \
+    -c ${checkpoints_path}
+
+echo "Step2 Run dev.json and generate json"
+python3 inference.py -e ${checkpoints_path}/bert_base_int8_b${BSZ}.engine \
+    -b ${BSZ} \
+    -s 384 \
+    -sq ${datasets_path}/squad/dev-v1.1.json \
+    -v ${checkpoints_path}/vocab.txt \
+    -o ${checkpoints_path}/predictions-bert_base_int8_b${BSZ}.json \
+    -z ${USE_TRT} \
+    -i
+
+echo "Step3 Inference (test F1-score)"
+python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_base_int8_b${BSZ}.json ${TGT}
\ No newline at end of file