diff --git a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt b/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt deleted file mode 100644 index 9a0e7a1217b72ee65ddca197e07b75294e736d60..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/CMakeLists.txt +++ /dev/null @@ -1,49 +0,0 @@ -cmake_minimum_required(VERSION 3.10) - -project(nv_plugin) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake;${CMAKE_MODULE_PATH}") -set(CMAKE_CXX_EXTENSIONS OFF) - -set(TARGET_NAME ixrt_plugin) -set(SHARED_TARGET ${TARGET_NAME}) -set(STATIC_TARGET ${TARGET_NAME}_static) -set(PLUGIN_REPO_PATH ${PROJECT_SOURCE_DIR}) - -if(DEFINED USE_TENSORRT) - find_package(CUDA) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-arch=sm_75) - - include_directories( - ${CUDA_PATH}/include) - - message(STATUS "Plugin lib use TRT 8.6.1") - set(TRT_INC_PATH /usr/include/x86_64-linux-gnu/) - set(TRT_LIB_PATH /usr/lib/x86_64-linux-gnu/ /usr/local/cuda/targets/x86_64-linux/lib) - set(TRT_LIBRARY nvinfer cublasLt) - - message(STATUS "cuda_libs = ${CUDA_LIBRARIES}") - message(STATUS "cudadevrt_libs = ${CUDA_cudadevrt_LIBRARY}") -else() - include(FindIxrt) - include(FindCompiler) - include(FindCuda) - set(TRT_LIBRARY cublasLt cudart ixrt) - include_directories(${IXRT_INCLUDE_DIR} - ${CUDA_PATH}/include) - add_definitions(-D__ILUVATAR__) - - string(APPEND CMAKE_CXX_FLAGS " -std=c++17") -endif() - -include(FindPluginFiles) - -################################## Compile Options ###################################### -cuda_add_library(${SHARED_TARGET} SHARED - ${PLUGIN_FILES} -) - -target_link_libraries(${SHARED_TARGET} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY} ${TRT_LIBRARY}) -target_link_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/lib64 ${TRT_LIB_PATH} ${IXRT_LIB_DIR}) -target_include_directories(${SHARED_TARGET} PUBLIC ${CUDA_PATH}/include ${TRT_INC_PATH} src PUBLIC src/common) diff --git a/models/nlp/plm/bert_large_squad/ixrt/README.md b/models/nlp/plm/bert_large_squad/ixrt/README.md index 13e741143479e58b50e79745511facd2542c4ea7..f66034138a29e03f054007f7391aca3c539b6dd5 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/README.md +++ b/models/nlp/plm/bert_large_squad/ixrt/README.md @@ -17,58 +17,21 @@ BERT is designed to pre-train deep bidirectional representations from unlabeled Get `bert-large-uncased.zip` from [Google Drive](https://drive.google.com/file/d/1eD8QBkbK6YN-_YXODp3tmpp3cZKlrPTA/view?usp=drive_link) -```bash -cd python/ -bash script/prepare.sh v1_1 -``` - -### Install Dependencies - -#### Install on Iluvatar - -```bash -cmake -S . -B build -cmake --build build -j16 -``` - -#### Install on NV - -Require tensorrt_version >= 8.6 - -```bash -# Get TensorRT docker image -docker pull nvcr.io/nvidia/tensorrt:23.04-py3 -# Run TensorRT docker -``` - -```bash -# Install requirements.txt in TensorRT docker -pip3 install -r requirements.txt - -# Build -cmake -S . 
-B build -DUSE_TENSORRT=true -cmake --build build -j16 -``` ## Model Inference ### FP16 ```bash -cd python/ - -# use --bs to set max_batch_size (dynamic) -bash script/build_engine.sh --bs 32 -bash script/inference_squad.sh --bs 32 +bash script/infer_bert_large_squad_fp16_accuracy.sh +bash script/infer_bert_large_squad_fp16_performance.sh ``` ### INT8 ```bash -cd python -pip install onnx pycuda -bash script/build_engine.sh --bs 32 --int8 -bash script/inference_squad.sh --bs 32 --int8 +bash script/infer_bert_large_squad_int8_accuracy.sh +bash script/infer_bert_large_squad_int8_performance.sh ``` | Model | BatchSize | Precision | Latency QPS | exact_match | f1 | diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py b/models/nlp/plm/bert_large_squad/ixrt/builder.py similarity index 38% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder.py rename to models/nlp/plm/bert_large_squad/ixrt/builder.py index 627027a09834314d25883d768b935b970a0fa64f..970f91bc27011be0ca26e1ac2a4f4cc255010ec8 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder.py @@ -1,3 +1,19 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# + #!/usr/bin/env python3 # Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. @@ -30,38 +46,64 @@ # limitations under the License. 
# -import os import argparse -import json -import tensorrt as trt -import time -import sys import ctypes +import json import os -import numpy as np -from builder_utils import load_onnx_weights_and_quant, load_pytorch_weights_and_quant -from builder_utils import WQKV, BQKV # Attention Keys -from builder_utils import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys -from builder_utils import SQD_W, SQD_B # SQuAD Output Keys +import sys +import time -trt_version = [int(n) for n in trt.__version__.split('.')] -plugin_lib_name = "libnvinfer_plugin.so" if os.getenv('USE_TRT') == 'True' else "libixrt_plugin.so" +import numpy as np +import ixrt +from builder_utils import ( # Attention Keys; Transformer Keys; SQuAD Output Keys + B_AOUT, + B_LOUT, + B_MID, + BQKV, + SQD_B, + SQD_W, + W_AOUT, + W_LOUT, + W_MID, + WQKV, + load_onnx_weights_and_quant, + load_pytorch_weights_and_quant, +) + +plugin_lib_name = ( + "libnvinfer_plugin.so" if os.getenv("USE_TRT") == "True" else "libixrt_plugin.so" +) print(plugin_lib_name) -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) -from load_ixrt_plugin import load_ixrt_plugin, is_nvidia_platform +TRT_LOGGER = ixrt.Logger(ixrt.Logger.WARNING) +from load_ixrt_plugin import load_ixrt_plugin + load_ixrt_plugin(TRT_LOGGER) -plg_registry = trt.get_plugin_registry() +plg_registry = ixrt.get_plugin_registry() registry_list = plg_registry.plugin_creator_list -print("registry_list: ", [registry.name + '/' + registry.plugin_version for registry in registry_list]) -emln_plg_creator = plg_registry.get_plugin_creator("CustomEmbLayerNormPluginDynamic_IxRT", "1", "") -qkv2_plg_creator = plg_registry.get_plugin_creator("CustomQKVToContextPluginDynamic_IxRT", "1", "") -skln_plg_creator = plg_registry.get_plugin_creator("CustomSkipLayerNormPluginDynamic_IxRT", "1", "") -ffn_plg_creator = plg_registry.get_plugin_creator("CustomFFNPluginDynamic_IxRT", "1", "") -gelu_plg_creator = plg_registry.get_plugin_creator("CustomGeluPluginDynamic_IxRT", "1", "") +print( + "registry_list: ", + [registry.name + "/" + registry.plugin_version for registry in registry_list], +) +emln_plg_creator = plg_registry.get_plugin_creator( + "CustomEmbLayerNormPluginDynamic_IxRT", "1", "" +) +qkv2_plg_creator = plg_registry.get_plugin_creator( + "CustomQKVToContextPluginDynamic_IxRT", "1", "" +) +skln_plg_creator = plg_registry.get_plugin_creator( + "CustomSkipLayerNormPluginDynamic_IxRT", "1", "" +) +ffn_plg_creator = plg_registry.get_plugin_creator( + "CustomFFNPluginDynamic_IxRT", "1", "" +) +gelu_plg_creator = plg_registry.get_plugin_creator( + "CustomGeluPluginDynamic_IxRT", "1", "" +) fc_plg_creator = plg_registry.get_plugin_creator("CustomFCPluginDynamic_IxRT", "1", "") + class BertConfig: def __init__(self, bert_config_path, use_fp16, use_trt): with open(bert_config_path, "r") as f: @@ -74,42 +116,51 @@ class BertConfig: self.use_fp16 = use_fp16 self.use_trt = use_trt + def set_tensor_name(tensor, prefix, name): tensor.name = prefix + name -def set_output_name(layer, prefix, name, out_idx = 0): + +def set_output_name(layer, prefix, name, out_idx=0): set_tensor_name(layer.get_output(out_idx), prefix, name) -def set_output_range(layer, maxval, out_idx = 0): + +def set_output_range(layer, maxval, out_idx=0): layer.get_output(out_idx).set_dynamic_range(-maxval, maxval) + def get_mha_dtype(config): - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 + dtype = ixrt.float16 return int(dtype) + def custom_fc(network, input_tensor, out_dims, W, B): - pf_out_dims = 
trt.PluginField("out_dims", np.array(out_dims, dtype=np.int32), trt.PluginFieldType.INT32) - pf_type = trt.PluginField("type_id", np.array(int(trt.float16), dtype=np.int32), trt.PluginFieldType.INT32) - pf_W = trt.PluginField("W", W, trt.PluginFieldType.FLOAT32) + pf_out_dims = ixrt.PluginField( + "out_dims", np.array(out_dims, dtype=np.int32), ixrt.PluginFieldType.INT32 + ) + pf_type = ixrt.PluginField( + "type_id", np.array(int(ixrt.float16), dtype=np.int32), ixrt.PluginFieldType.INT32 + ) + pf_W = ixrt.PluginField("W", W, ixrt.PluginFieldType.FLOAT32) fields = [pf_out_dims, pf_type, pf_W] if B is not None: - pf_B = trt.PluginField("B", B, trt.PluginFieldType.FLOAT32) + pf_B = ixrt.PluginField("B", B, ixrt.PluginFieldType.FLOAT32) fields.append(pf_B) - pfc = trt.PluginFieldCollection(fields) + pfc = ixrt.PluginFieldCollection(fields) fc_plugin = fc_plg_creator.create_plugin("fcplugin", pfc) plug_inputs = [input_tensor] out_dense = network.add_plugin_v2(plug_inputs, fc_plugin) return out_dense + def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask): """ Add the attention layer """ - assert(len(input_tensor.shape) == 5) - B, S, hidden_size, _, _ = input_tensor.shape + B, S, hidden_size = input_tensor.shape num_heads = config.num_attention_heads head_size = int(hidden_size / num_heads) @@ -117,18 +168,27 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) Ball = init_dict[prefix + BQKV] # FC_attention - if config.use_trt: - mult_all = network.add_fully_connected(input_tensor, 3 * hidden_size, Wall, Ball) - else: - mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball) + mult_all = custom_fc(network, input_tensor, 3 * hidden_size, Wall, Ball) has_mask = imask is not None # QKV2CTX - pf_type = trt.PluginField("type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) - pf_hidden_size = trt.PluginField("hidden_size", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) - pf_num_heads = trt.PluginField("num_heads", np.array([num_heads], np.int32), trt.PluginFieldType.INT32) - pf_has_mask = trt.PluginField("has_mask", np.array([has_mask], np.int32), trt.PluginFieldType.INT32) - pfc = trt.PluginFieldCollection([pf_hidden_size, pf_num_heads, pf_has_mask, pf_type]) + pf_type = ixrt.PluginField( + "type_id", + np.array([get_mha_dtype(config)], np.int32), + ixrt.PluginFieldType.INT32, + ) + pf_hidden_size = ixrt.PluginField( + "hidden_size", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_num_heads = ixrt.PluginField( + "num_heads", np.array([num_heads], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_has_mask = ixrt.PluginField( + "has_mask", np.array([has_mask], np.int32), ixrt.PluginFieldType.INT32 + ) + pfc = ixrt.PluginFieldCollection( + [pf_hidden_size, pf_num_heads, pf_has_mask, pf_type] + ) qkv2ctx_plug = qkv2_plg_creator.create_plugin("qkv2ctx", pfc) qkv_in = [mult_all.get_output(0)] @@ -143,46 +203,56 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, bias=None): Add the skip layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 + dtype = ixrt.float16 - pf_ld = trt.PluginField("ld", np.array([hidden_size], np.int32), trt.PluginFieldType.INT32) + pf_ld = ixrt.PluginField( + "ld", np.array([hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) wbeta = init_dict[prefix + "beta"] - pf_beta = trt.PluginField("beta", wbeta, 
trt.PluginFieldType.FLOAT32) + pf_beta = ixrt.PluginField("beta", wbeta, ixrt.PluginFieldType.FLOAT32) wgamma = init_dict[prefix + "gamma"] - pf_gamma = trt.PluginField("gamma", wgamma, trt.PluginFieldType.FLOAT32) - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) + pf_gamma = ixrt.PluginField("gamma", wgamma, ixrt.PluginFieldType.FLOAT32) + pf_type = ixrt.PluginField( + "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32 + ) - fields = [pf_ld, pf_beta, pf_gamma, pf_type ] + fields = [pf_ld, pf_beta, pf_gamma, pf_type] if bias is not None: - pf_bias = trt.PluginField("bias", bias, trt.PluginFieldType.FLOAT32) + pf_bias = ixrt.PluginField("bias", bias, ixrt.PluginFieldType.FLOAT32) fields.append(pf_bias) - pfc = trt.PluginFieldCollection(fields) + pfc = ixrt.PluginFieldCollection(fields) skipln_plug = skln_plg_creator.create_plugin("skipln", pfc) skipln_inputs = [input_tensor, skip] layer = network.add_plugin_v2(skipln_inputs, skipln_plug) return layer + def ffn_trt(prefix, config, init_dict, network, input_tensor): - # FC1 + GELU + # FC1 + GELU B_mid = init_dict[prefix + B_MID] W_mid = init_dict[prefix + W_MID] - mid_dense = network.add_fully_connected(input_tensor, config.intermediate_size, W_mid, B_mid) + mid_dense = network.add_fully_connected( + input_tensor, config.intermediate_size, W_mid, B_mid + ) - dtype = trt.float32 + dtype = ixrt.float32 if config.use_fp16: - dtype = trt.float16 - pf_type = trt.PluginField("type_id", np.array([int(dtype)], np.int32), trt.PluginFieldType.INT32) - pf_ld = trt.PluginField("ld", np.array([config.hidden_size], np.int32), trt.PluginFieldType.INT32) - - pfc = trt.PluginFieldCollection([pf_type, pf_ld]) + dtype = ixrt.float16 + pf_type = ixrt.PluginField( + "type_id", np.array([int(dtype)], np.int32), ixrt.PluginFieldType.INT32 + ) + pf_ld = ixrt.PluginField( + "ld", np.array([config.hidden_size], np.int32), ixrt.PluginFieldType.INT32 + ) + + pfc = ixrt.PluginFieldCollection([pf_type, pf_ld]) gelu_plug = gelu_plg_creator.create_plugin("gelu", pfc) gelu_inputs = [mid_dense.get_output(0)] @@ -194,54 +264,88 @@ def ffn_trt(prefix, config, init_dict, network, input_tensor): # Dense to hidden size B_lout = init_dict[prefix + B_LOUT] W_lout = init_dict[prefix + W_LOUT] - out_dense = network.add_fully_connected(intermediate_act, config.hidden_size, W_lout, B_lout) + out_dense = network.add_fully_connected( + intermediate_act, config.hidden_size, W_lout, B_lout + ) B_lout = None - out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, out_dense.get_output(0), input_tensor, B_lout) + out_layer = skipln( + prefix + "output_layernorm_", + config, + init_dict, + network, + out_dense.get_output(0), + input_tensor, + B_lout, + ) return out_layer + def ffn(prefix, config, init_dict, network, input_tensor): # FC1 + GELU B_mid = init_dict[prefix + B_MID] W_mid = init_dict[prefix + W_MID] B_lout = init_dict[prefix + B_LOUT] W_lout = init_dict[prefix + W_LOUT] - pf_out_dim = trt.PluginField("out_dims", np.array(config.hidden_size, np.int32), trt.PluginFieldType.INT32) - pf_type = trt.PluginField("type_id", np.array(int(trt.float16), np.int32), trt.PluginFieldType.INT32) - pf_W1 = trt.PluginField("W1", W_mid, trt.PluginFieldType.FLOAT32) - pf_W2 = trt.PluginField("W2", W_lout, trt.PluginFieldType.FLOAT32) - pf_B1 = trt.PluginField("B1", B_mid, trt.PluginFieldType.FLOAT32) - pf_act_type = trt.PluginField("act_type", np.array(int(3), np.int32), trt.PluginFieldType.INT32) - pfc = 
trt.PluginFieldCollection([pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type]) + pf_out_dim = ixrt.PluginField( + "out_dims", np.array(config.hidden_size, np.int32), ixrt.PluginFieldType.INT32 + ) + pf_type = ixrt.PluginField( + "type_id", np.array(int(ixrt.float16), np.int32), ixrt.PluginFieldType.INT32 + ) + pf_W1 = ixrt.PluginField("W1", W_mid, ixrt.PluginFieldType.FLOAT32) + pf_W2 = ixrt.PluginField("W2", W_lout, ixrt.PluginFieldType.FLOAT32) + pf_B1 = ixrt.PluginField("B1", B_mid, ixrt.PluginFieldType.FLOAT32) + pf_act_type = ixrt.PluginField( + "act_type", np.array(int(3), np.int32), ixrt.PluginFieldType.INT32 + ) + pfc = ixrt.PluginFieldCollection( + [pf_out_dim, pf_type, pf_W1, pf_W2, pf_B1, pf_act_type] + ) ffn_plug = ffn_plg_creator.create_plugin("ffn", pfc) ffn_inputs = [input_tensor] ffn_layer = network.add_plugin_v2(ffn_inputs, ffn_plug) - out_layer = skipln(prefix + "output_layernorm_", config, init_dict, network, ffn_layer.get_output(0), input_tensor, B_lout) + out_layer = skipln( + prefix + "output_layernorm_", + config, + init_dict, + network, + ffn_layer.get_output(0), + input_tensor, + B_lout, + ) return out_layer + def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imask): """ Add the transformer layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] - context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) + context_transposed = attention_layer_opt( + prefix + "attention_", config, init_dict, network, input_tensor, imask + ) attention_heads = context_transposed.get_output(0) - + # FC0 B_aout = init_dict[prefix + B_AOUT] W_aout = init_dict[prefix + W_AOUT] - if config.use_trt: - attention_out_fc = network.add_fully_connected(attention_heads, hidden_size, W_aout, B_aout) - else: - attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout) - B_aout = None - - skiplayer = skipln(prefix + "attention_output_layernorm_",config, init_dict, network, attention_out_fc.get_output(0), input_tensor, B_aout) + attention_out_fc = custom_fc(network, attention_heads, hidden_size, W_aout, B_aout) + B_aout = None + + skiplayer = skipln( + prefix + "attention_output_layernorm_", + config, + init_dict, + network, + attention_out_fc.get_output(0), + input_tensor, + B_aout, + ) attention_ln = skiplayer.get_output(0) if config.use_trt: @@ -250,121 +354,277 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas ffn_layer = ffn(prefix, config, init_dict, network, attention_ln) return ffn_layer + def bert_model(config, init_dict, network, input_tensor, input_mask): """ Create the bert model """ prev_input = input_tensor for layer in range(0, config.num_hidden_layers): - ss = "l{}_".format(layer) - out_layer = transformer_layer_opt(ss, config, init_dict, network, prev_input, input_mask) + ss = "l{}_".format(layer) + out_layer = transformer_layer_opt( + ss, config, init_dict, network, prev_input, input_mask + ) prev_input = out_layer.get_output(0) return prev_input + def squad_output(prefix, config, init_dict, network, input_tensor): """ Create the squad output """ idims = input_tensor.shape - assert len(idims) == 5 - B, S, hidden_size, _, _ = idims + B, S, hidden_size = idims W_out = init_dict[prefix + SQD_W] B_out = init_dict[prefix + SQD_B] + dense = custom_fc(network, input_tensor, 2, W_out, B_out) + if config.use_trt: - dense = network.add_fully_connected(input_tensor, 2, W_out, B_out) - else: - dense = custom_fc(network, 
input_tensor, 2, W_out, B_out) - + OUT = network.add_shuffle(dense.get_output(0)) + OUT.second_transpose = (1, 0, 2) + return OUT return dense -def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): - input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) - input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(batch_sizes) > 1 else batch_sizes[0], -1 if len(sequence_lengths) > 1 else sequence_lengths[0])) + +def emb_layernorm( + builder, + network, + config, + weights_dict, + builder_config, + sequence_lengths, + batch_sizes, +): + input_ids = network.add_input( + name="input_ids", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) + segment_ids = network.add_input( + name="segment_ids", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) + input_mask = network.add_input( + name="input_mask", + dtype=ixrt.int32, + shape=( + -1 if len(batch_sizes) > 1 else batch_sizes[0], + -1 if len(sequence_lengths) > 1 else sequence_lengths[0], + ), + ) if len(sequence_lengths) > 1: profile = builder.create_optimization_profile() min_shape = (batch_sizes[0], sequence_lengths[0]) opt_shape = (batch_sizes[1], sequence_lengths[1]) max_shape = (batch_sizes[2], sequence_lengths[2]) - assert(sequence_lengths[0] <= sequence_lengths[1] and sequence_lengths[1] <= sequence_lengths[2]) - - print('set dynamic shape -> ', min_shape, opt_shape, max_shape) + assert ( + sequence_lengths[0] <= sequence_lengths[1] + and sequence_lengths[1] <= sequence_lengths[2] + ) + + print("set dynamic shape -> ", min_shape, opt_shape, max_shape) profile.set_shape("input_ids", min_shape, opt_shape, max_shape) profile.set_shape("segment_ids", min_shape, opt_shape, max_shape) profile.set_shape("input_mask", min_shape, opt_shape, max_shape) builder_config.add_optimization_profile(profile) - wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"], trt.PluginFieldType.FLOAT32) - wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"], trt.PluginFieldType.FLOAT32) - wwordemb = trt.PluginField("bert_embeddings_word_embeddings", weights_dict["bert_embeddings_word_embeddings"], trt.PluginFieldType.FLOAT32) - wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) - wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) - - output_fp16 = trt.PluginField("output_fp16", np.array([1 if config.use_fp16 else 0]).astype(np.int32), trt.PluginFieldType.INT32) - mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) - - pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) + wbeta = ixrt.PluginField( + "bert_embeddings_layernorm_beta", + weights_dict["bert_embeddings_layernorm_beta"], + 
ixrt.PluginFieldType.FLOAT32, + ) + + wgamma = ixrt.PluginField( + "bert_embeddings_layernorm_gamma", + weights_dict["bert_embeddings_layernorm_gamma"], + ixrt.PluginFieldType.FLOAT32, + ) + wwordemb = ixrt.PluginField( + "bert_embeddings_word_embeddings", + weights_dict["bert_embeddings_word_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + wtokemb = ixrt.PluginField( + "bert_embeddings_token_type_embeddings", + weights_dict["bert_embeddings_token_type_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + wposemb = ixrt.PluginField( + "bert_embeddings_position_embeddings", + weights_dict["bert_embeddings_position_embeddings"], + ixrt.PluginFieldType.FLOAT32, + ) + + output_fp16 = ixrt.PluginField( + "output_fp16", + np.array([1 if config.use_fp16 else 0]).astype(np.int32), + ixrt.PluginFieldType.INT32, + ) + mha_type = ixrt.PluginField( + "mha_type_id", + np.array([get_mha_dtype(config)], np.int32), + ixrt.PluginFieldType.INT32, + ) + + pfc = ixrt.PluginFieldCollection( + [wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type] + ) fn = emln_plg_creator.create_plugin("embeddings", pfc) - inputs = [input_ids, segment_ids, input_mask] + if config.use_trt: + input_ids = network.add_shuffle(input_ids) + input_ids.second_transpose = (1, 0) + segment_ids = network.add_shuffle(segment_ids) + segment_ids.second_transpose = (1, 0) + input_mask = network.add_shuffle(input_mask) + input_mask.second_transpose = (1, 0) + inputs = [ + input_ids.get_output(0), + segment_ids.get_output(0), + input_mask.get_output(0), + ] + else: + inputs = [input_ids, segment_ids, input_mask] emb_layer = network.add_plugin_v2(inputs, fn) return emb_layer + def build_engine(batch_sizes, sequence_lengths, config, weights_dict): - explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + explicit_batch_flag = 1 << int(ixrt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - builder = trt.Builder(TRT_LOGGER) - with builder.create_network(explicit_batch_flag) as network, builder.create_builder_config() as builder_config: + builder = ixrt.Builder(TRT_LOGGER) + with builder.create_network( + explicit_batch_flag + ) as network, builder.create_builder_config() as builder_config: if config.use_fp16: - builder_config.set_flag(trt.BuilderFlag.FP16) + builder_config.set_flag(ixrt.BuilderFlag.FP16) # Create the network - emb_layer = emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes) + emb_layer = emb_layernorm( + builder, + network, + config, + weights_dict, + builder_config, + sequence_lengths, + batch_sizes, + ) embeddings = emb_layer.get_output(0) mask_idx = emb_layer.get_output(1) - + bert_out = bert_model(config, weights_dict, network, embeddings, mask_idx) squad_logits = squad_output("cls_", config, weights_dict, network, bert_out) squad_logits_out = squad_logits.get_output(0) + squad_logits.set_output_type(0, ixrt.float32) network.mark_output(squad_logits_out) build_start_time = time.time() - engine = builder.build_engine(network, builder_config) - build_time_elapsed = (time.time() - build_start_time) - TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) - return engine - + serialized_engine = builder.build_serialized_network(network, builder_config) + build_time_elapsed = time.time() - build_start_time + TRT_LOGGER.log( + TRT_LOGGER.INFO, "build serialized_engine in {:.3f} Sec".format(build_time_elapsed) + ) + return serialized_engine + + def str2bool(v): - return v.lower() in ('yes', 'true') + return v.lower() 
in ("yes", "true") + def main(): - parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-z", "--use_trt", type=str2bool, default=False, help = "Whether to use tensorRT or IxRT") - parser.add_argument("-x", "--onnx", required=False, help="The ONNX model file path.") - parser.add_argument("-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path.") - parser.add_argument("-o", "--output", required=True, default="bert_base_384.engine", help="The bert engine file, ex bert.engine") - parser.add_argument("-b", "--batch-size", nargs='+', help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", type=int) - parser.add_argument("-s", "--sequence-length", nargs='+', help="Sequence length of the BERT model", type=int) - parser.add_argument("-c", "--config-dir", required=True, - help="The folder containing the bert_config.json, which can be downloaded e.g. from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google") - parser.add_argument("-f", "--fp16", action="store_true", help="Indicates that inference should be run in FP16 precision", required=False) - parser.add_argument("-j", "--squad-json", default="squad/dev-v1.1.json", help="squad json dataset used for int8 calibration", required=False) - parser.add_argument("-v", "--vocab-file", default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", help="Path to file containing entire understandable vocab", required=False) - parser.add_argument("--verbose", action="store_true", help="Turn on verbose logger and set profiling verbosity to DETAILED", required=False) + parser = argparse.ArgumentParser( + description="IxRT BERT Sample", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-z", + "--use_trt", + type=str2bool, + default=False, + help="Whether to use ixrt or IxRT", + ) + parser.add_argument( + "-x", "--onnx", required=False, help="The ONNX model file path." + ) + parser.add_argument( + "-pt", "--pytorch", required=False, help="The PyTorch checkpoint file path." + ) + parser.add_argument( + "-o", + "--output", + required=True, + default="bert_base_384.engine", + help="The bert engine file, ex bert.engine", + ) + parser.add_argument( + "-b", + "--batch-size", + nargs="+", + help="Batch size(s) to optimize for. The engine will be usable with any batch size below this, but may not be optimal for smaller sizes. Can be specified multiple times to optimize for more than one batch size.", + type=int, + ) + parser.add_argument( + "-s", + "--sequence-length", + nargs="+", + help="Sequence length of the BERT model", + type=int, + ) + parser.add_argument( + "-c", + "--config-dir", + required=True, + help="The folder containing the bert_config.json, which can be downloaded e.g. 
from https://github.com/google-research/bert#pre-trained-models or by running download_models.py in dle/TensorFlow/LanguageModeling/BERT/data/pretrained_models_google", + ) + parser.add_argument( + "-f", + "--fp16", + action="store_true", + help="Indicates that inference should be run in FP16 precision", + required=False, + ) + parser.add_argument( + "-j", + "--squad-json", + default="squad/dev-v1.1.json", + help="squad json dataset used for int8 calibration", + required=False, + ) + parser.add_argument( + "-v", + "--vocab-file", + default="./pre-trained_model/uncased_L-24_H-1024_A-16/vocab.txt", + help="Path to file containing entire understandable vocab", + required=False, + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Turn on verbose logger and set profiling verbosity to DETAILED", + required=False, + ) args, _ = parser.parse_known_args() args.batch_size = args.batch_size or [1] args.sequence_length = args.sequence_length or [128] - args.use_trt = is_nvidia_platform() if len(args.sequence_length) not in [1, 3]: - print("Error: You must provide either one or three integers.") + print( + "Error: You must provide either one or three integers." + ) sys.exit(1) if len(args.batch_size) not in [1, 3]: @@ -375,7 +635,9 @@ def main(): TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE bert_config_path = args.config_dir - TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) + TRT_LOGGER.log( + TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path) + ) config = BertConfig(bert_config_path, args.fp16, args.use_trt) @@ -384,15 +646,18 @@ def main(): elif args.pytorch != None: weights_dict = load_pytorch_weights_and_quant(args.pytorch, config) else: - raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") + raise RuntimeError( + "You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model." + ) - with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine: - TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...") - serialized_engine = engine.serialize() + with build_engine( + args.batch_size, args.sequence_length, config, weights_dict + ) as serialized_engine: TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) with open(args.output, "wb") as fout: fout.write(serialized_engine) TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py similarity index 96% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_int8.py index e51d7c40d5fd0a9d79514b0367b446058ddec14f..c08b06bc2242c970f2ff649d3def79f311a68e1f 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_int8.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_int8.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -15,6 +15,7 @@ # under the License. # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -43,6 +44,7 @@ from builder_utils_int8 import load_pytorch_weights_and_quant from builder_utils_int8 import WQKV, BQKV # Attention Keys from builder_utils_int8 import W_AOUT, B_AOUT, W_MID, B_MID, W_LOUT, B_LOUT # Transformer Keys from builder_utils_int8 import SQD_W, SQD_B # SQuAD Output Keys +from builder import custom_fc as custom_fc_fp16 trt_version = [int(n) for n in trt.__version__.split('.')] @@ -114,8 +116,7 @@ def attention_layer_opt(prefix, config, init_dict, network, input_tensor, imask) """ Add the attention layer """ - assert(len(input_tensor.shape) == 5) - B, S, hidden_size, _, _ = input_tensor.shape + B, S, hidden_size = input_tensor.shape num_heads = config.num_attention_heads head_size = int(hidden_size / num_heads) @@ -157,7 +158,6 @@ def skipln(prefix, config, init_dict, network, input_tensor, skip, residual, is_ Add the skip layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] dtype = trt.float32 @@ -236,7 +236,6 @@ def transformer_layer_opt(prefix, config, init_dict, network, input_tensor, imas Add the transformer layer """ idims = input_tensor.shape - assert len(idims) == 5 hidden_size = idims[2] context_transposed = attention_layer_opt(prefix + "attention_", config, init_dict, network, input_tensor, imask) @@ -281,13 +280,12 @@ def squad_output(prefix, config, init_dict, network, input_tensor): """ idims = input_tensor.shape - assert len(idims) == 5 - B, S, hidden_size, _, _ = idims + B, S, hidden_size = idims W_out = init_dict[prefix + SQD_W] B_out = init_dict[prefix + SQD_B] - dense = network.add_fully_connected(input_tensor, 2, W_out, B_out) + dense = custom_fc_fp16(network, input_tensor, 2, W_out, B_out) return dense def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes): @@ -314,7 +312,7 @@ def emb_layernorm(builder, network, config, weights_dict, builder_config, sequen wtokemb = trt.PluginField("bert_embeddings_token_type_embeddings", weights_dict["bert_embeddings_token_type_embeddings"], trt.PluginFieldType.FLOAT32) wposemb = trt.PluginField("bert_embeddings_position_embeddings", weights_dict["bert_embeddings_position_embeddings"], trt.PluginFieldType.FLOAT32) - output_fp16 = trt.PluginField("output_fp16", np.array([0]).astype(np.int32), trt.PluginFieldType.INT32) + output_fp16 = trt.PluginField("output_fp16", np.array([1]).astype(np.int32), trt.PluginFieldType.INT32) mha_type = trt.PluginField("mha_type_id", np.array([get_mha_dtype(config)], np.int32), trt.PluginFieldType.INT32) pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type]) @@ -354,10 +352,10 @@ def build_engine(batch_sizes, sequence_lengths, config, weights_dict): network.mark_output(squad_logits_out) build_start_time = time.time() - engine = builder.build_engine(network, builder_config) + plan = builder.build_serialized_network(network, builder_config) build_time_elapsed = (time.time() - build_start_time) TRT_LOGGER.log(TRT_LOGGER.INFO, "build engine in {:.3f} Sec".format(build_time_elapsed)) - return engine + return plan def main(): parser = argparse.ArgumentParser(description="TensorRT BERT Sample", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -389,7 +387,7 @@ def main(): if args.verbose: TRT_LOGGER.min_severity = TRT_LOGGER.VERBOSE - bert_config_path = args.config_dir + bert_config_path = os.path.join(args.config_dir, "bert_config.json") 
TRT_LOGGER.log(TRT_LOGGER.INFO, "Using configuration file: {:}".format(bert_config_path)) config = BertConfig(bert_config_path, args.int8) @@ -403,13 +401,11 @@ def main(): raise RuntimeError("You need either specify TF checkpoint using option --ckpt or ONNX using option --onnx to build TRT BERT model.") # engine = build_engine(args.batch_size, args.workspace_size, args.sequence_length, config, weights_dict, args.squad_json, args.vocab_file, None, args.calib_num, args.verbose) - with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as engine: - TRT_LOGGER.log(TRT_LOGGER.VERBOSE, "Serializing Engine...") - serialized_engine = engine.serialize() + with build_engine(args.batch_size, args.sequence_length, config, weights_dict) as serialized_engine: TRT_LOGGER.log(TRT_LOGGER.INFO, "Saving Engine to {:}".format(args.output)) with open(args.output, "wb") as fout: fout.write(serialized_engine) TRT_LOGGER.log(TRT_LOGGER.INFO, "Done.") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py similarity index 77% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils.py index 767379778633cafe889a4df414d8cc487495559b..8999935a341a86c12eb791e5ff13161a4d1a38aa 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,6 +14,7 @@ # under the License. # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -93,6 +94,10 @@ def get_onnx_weight_dict(tensor_dict, config): Bqkv[1,:] = tensor_dict[prefix + BK] Bqkv[2,:] = tensor_dict[prefix + BV] + if config.use_trt: + Wqkv = np.ascontiguousarray(Wqkv.reshape((3, N, H, N, H)).transpose((1,0,2,3,4))) + Bqkv = np.ascontiguousarray(Bqkv.reshape((3, N, H)).transpose((1,0,2))) + weights_dict[prefix + WQKV] = Wqkv.flatten() weights_dict[prefix + BQKV] = Bqkv.flatten() weights_dict[prefix + WQKV + "_notrans"] = np.ascontiguousarray(Wqkv.T).flatten() @@ -103,6 +108,10 @@ def get_onnx_weight_dict(tensor_dict, config): flat_tensor = np.ascontiguousarray(tensor).flatten() weights_dict[outname] = flat_tensor + if outname.find("kernel") != -1 and config.use_trt: + tensor = np.transpose(tensor) + weights_dict[outname + "_notrans"] = np.ascontiguousarray(tensor).flatten() + return weights_dict def onnx_to_trt_name(onnx_name): @@ -162,24 +171,67 @@ def onnx_to_trt_name(onnx_name): parsed = '_'.join(toks) return parsed +def pt_to_trt_name(pt_name): + """ + Converting variables in the onnx checkpoint to names corresponding to the naming convention used in the TF version, expected by the builder + """ + qkv_strings = {'key', 'value', 'query', 'query_key_value'} + pt_name = pt_name.lower() + toks = [t.strip('_') for t in pt_name.split('.')] + if toks[0] == 'bert': #embeddings or encoder + if toks[1] == 'encoder': #transformer + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + elif (toks[-2] == 'dense' or toks[-2] in qkv_strings) and toks[-1] == 'weight': + toks[-1] = 'kernel' + + if 'final_input_quantizer' not in toks[2]: + ind = toks.index('layers')+1 if 'layers' in toks else 3 + toks = toks[ind:] + toks[0] = 'l{}'.format(int(toks[0])) + + else: + if toks[-2] == 'layernorm': #bias->beta, weight->gamma + toks[-1] = 'beta' if toks[-1] == 'bias' else 'gamma' + else: #embeddings: drop "_weight" suffix + toks = toks[:-1] + + elif 'qa_outputs' in pt_name: ## + name = 'cls_squad_output_bias' if toks[-1] == 'bias' else 'cls_squad_output_weights' + return name + else: + print("Encountered unknown case:", pt_name) + assert(False) + parsed = '_'.join(toks) + return parsed + def load_onnx_weights_and_quant(path, config): """ Load the weights from the onnx checkpoint """ model = onnx.load(path) weights = model.graph.initializer + # for w in weights: + # print(w.name, w.dims,flush=True) tensor_dict = dict((onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.int8).reshape(w.dims)) if w.name.split('_')[-1] == 'mask' else (onnx_to_trt_name(w.name), np.frombuffer(w.raw_data, np.float32).reshape(w.dims)) for w in weights) + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) + return get_onnx_weight_dict(tensor_dict, config) def load_pytorch_weights_and_quant(path, config): """ Load the weights from the pytorch checkpoint """ - state_dict = torch.load(path, map_location='cpu')["model"] - tensor_dict = {onnx_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + state_dict = torch.load(path, map_location='cpu') + # for name in state_dict: + # print(name, state_dict[name].size(),flush=True) + tensor_dict = {pt_to_trt_name(name):val.numpy() for name, val in state_dict.items()} + # for key in tensor_dict: + # print(key, tensor_dict[key].shape,flush=True) return get_onnx_weight_dict(tensor_dict, config) class BertConfig: diff --git 
a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py similarity index 99% rename from models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py rename to models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py index 56ac8d1889912cb98817d5960767d94522441030..25770a77594829297bbd5f50a09d830d7f8b210e 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/builder_utils_int8.py +++ b/models/nlp/plm/bert_large_squad/ixrt/builder_utils_int8.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -15,6 +15,7 @@ # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 + # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh index ebc8effc48246556bd7fa5edadd0ad9d35a984a0..1f41b8ca5d61dfea08f0d4d4b76e03bc0f204ba2 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh +++ b/models/nlp/plm/bert_large_squad/ixrt/ci/prepare.sh @@ -25,17 +25,6 @@ else echo "Not Support Os" fi -# install ixrt run -bash /root/data/install/ixrt-1.0.0.alpha+corex.4.3.0-linux_x86_64.run - -if [ "$1" = "nvidia" ]; then - cmake -S . -B build -DUSE_TENSORRT=true - cmake --build build -j16 -else - cmake -S . -B build - cmake --build build -j16 -fi - -pip install -r requirements.txt -mkdir -p ./python/data -ln -s /root/data/checkpoints/bert-large-uncased/ ./python/data && ln -s /root/data/datasets/squad/ ./python/data \ No newline at end of file +mkdir -p data/datasets +mkdir -p data/checkpoints +ln -s /root/data/checkpoints/bert-large-uncased data/checkpoints && ln -s /root/data/datasets/squad data/datasets \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake deleted file mode 100644 index 07c436f5e545933e1debe34a0de482512f0ffb0a..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCompiler.cmake +++ /dev/null @@ -1,15 +0,0 @@ -if(NOT COMPILER_PATH) - if (EXISTS /opt/sw_home/local/bin/clang++) - set(COMPILER_PATH /opt/sw_home/local/bin) - elseif (EXISTS /usr/local/corex/bin/clang++) - set(COMPILER_PATH /usr/local/corex/bin) - else() - message(STATUS "COMPILER_PATH is not set and we couldn't find clang compiler neither, will use system C/C++ compiler") - endif() -endif() -if (COMPILER_PATH) - set(CMAKE_CXX_COMPILER ${COMPILER_PATH}/clang++) - set(CMAKE_C_COMPILER ${COMPILER_PATH}/clang) -endif() - -message(STATUS "Use ${CMAKE_CXX_COMPILER} and ${CMAKE_C_COMPILER} as C++ and C compiler") \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake deleted file mode 100644 index 58e39e6003cb6a0545a76f9a6fab88e44fe39caa..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindCuda.cmake +++ /dev/null @@ -1,57 +0,0 @@ -# This cmake does: -# - Set CUDA_PATH -# - Find libcudart -# - Util functions like cuda_add_library, cuda_add_executable - - -# CUDA_PATH can be specified 
through below means shown in priority order 1. -# cmake command line argument, -DCUDA_PATH=/path/to/cuda 2. bash environment -# variable, export CUDA_PATH=/path/to/cuda -if(DEFINED ENV{CUDA_PATH}) - set(CUDA_PATH "$ENV{CUDA_PATH}") -else() - set(CUDA_PATH - "/opt/sw_home/local/cuda" - CACHE PATH "cuda installation root path") -endif() -message(STATUS "Use CUDA_PATH=${CUDA_PATH} ") - -# GPU arch -if(NOT "${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - ${CUDA_ARCH} - CACHE STRING "GPU architecture tag, ivcore11") -else("${CUDA_ARCH}" STREQUAL "") - set(CUDA_ARCH - "ivcore11" - CACHE STRING "GPU architecture tag, ivcore11") -endif() -message(STATUS "Use CUDA_ARCH=${CUDA_ARCH}") - -macro(cuda_add_executable) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_executable(${ARGV}) -endmacro() - -macro(cuda_add_library) - foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") - set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) - endif() - endforeach() - add_library(${ARGV}) -endmacro() - -find_library( - CUDART_LIBRARY cudart - PATHS ${CUDA_PATH} - PATH_SUFFIXES lib/x64 lib64 lib - NO_DEFAULT_PATH) - -if (NOT USE_TRT) - set(CUDA_LIBRARIES cudart) -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake deleted file mode 100644 index 5b0f27293edaebf80cd5bfd622c363f49b36966b..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindIxrt.cmake +++ /dev/null @@ -1,19 +0,0 @@ -# This cmake file decides how to build with IxRT -# Custom IxRT Path -if(NOT "${IXRT_HOME}" STREQUAL "") - set(IXRT_INCLUDE_DIR ${IXRT_HOME}/include) - set(IXRT_LIB_DIR ${IXRT_HOME}/lib) -# From default paths -else() - set(IXRT_INCLUDE_DIR /usr/local/corex/include) - set(IXRT_LIB_DIR /usr/local/corex/lib) -endif() - -message(STATUS "IXRT_INCLUDE_DIR: ${IXRT_INCLUDE_DIR}") -message(STATUS "IXRT_LIB_DIR: ${IXRT_LIB_DIR}") - -if(EXISTS ${IXRT_INCLUDE_DIR} AND EXISTS ${IXRT_LIB_DIR}) - include_directories(${IXRT_INCLUDE_DIR}) -else() - message( FATAL_ERROR "IxRT library doesn't exist!") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake b/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake deleted file mode 100644 index 603606996e8a310579fd86de3ea36125f19bbea1..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/cmake/FindPluginFiles.cmake +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB_RECURSE PLUGIN_FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu) - -if(DEFINED USE_TENSORRT) - list(FILTER PLUGIN_FILES EXCLUDE REGEX "${CMAKE_CURRENT_SOURCE_DIR}/src/backend/ixinfer") -endif() \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py similarity index 82% rename from models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py rename to models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py index ce5bb98df7f60176ac5def72f4c2a5d1d54f990e..7b14e8380d6a1c7348633ffacc2cd6fb2818e0a2 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/evaluate-v1.1.py +++ b/models/nlp/plm/bert_large_squad/ixrt/evaluate-v1.1.py @@ -1,18 +1,4 @@ #!/usr/bin/env python3 -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., 
Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -107,10 +93,6 @@ def evaluate(dataset, predictions, f1_acc): print("&&&& FAILED TensorRT BERT Squad Accuracy matches reference.") else: print("&&&& PASSED TensorRT BERT Squad Accuracy matches reference.") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["exact_match"] = round(exact_match, 3) - metricResult["metricResult"]["f1"] = round(f1, 3) - print(metricResult) return {'exact_match': exact_match, 'f1': f1} if __name__ == '__main__': @@ -131,4 +113,4 @@ if __name__ == '__main__': with open(args.prediction_file) as prediction_file: predictions = json.load(prediction_file) f1_acc = float(args.f1_acc) - print(json.dumps(evaluate(dataset, predictions, f1_acc))) + print(json.dumps(evaluate(dataset, predictions, f1_acc))) \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/__init__.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/__init__.py diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py similarity index 89% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py index beacc625fae0f73bda3480054e4ecceca85fb240..73084f39b8de03d8cfcdfb37d31407d30d9c3176 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/calibrator.py +++ b/models/nlp/plm/bert_large_squad/ixrt/helpers/calibrator.py @@ -19,8 +19,8 @@ import tensorrt as trt import os -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import numpy as np import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -80,9 +80,12 @@ class BertCalibrator(trt.IInt8LegacyCalibrator): segment_ids = features[0].segment_ids input_mask = features[0].input_mask - cuda.memcpy_htod(self.device_inputs[0], input_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[1], segment_ids.ravel()) - cuda.memcpy_htod(self.device_inputs[2], input_mask.ravel()) + err, = cuda.cuMemcpyHtoD(self.device_inputs[0], input_ids.ravel(), input_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[1], segment_ids.ravel(), segment_ids.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoD(self.device_inputs[2], input_mask.ravel(), input_mask.ravel().nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) self.current_index += self.batch_size return self.device_inputs diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py 
b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py similarity index 98% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py index 712e1a61d29a198eb276f41a9249b0c66e3786ba..88459ebfafbd84c11356c0a3dfc3838882e4b2f8 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/data_processing.py +++ b/models/nlp/plm/bert_large_squad/ixrt/helpers/data_processing.py @@ -159,14 +159,14 @@ def convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_le input_mask = [1] * len(input_ids) # Zero-pad up to the sequence length. - # while len(input_ids) < max_seq_length: - # input_ids.append(0) - # input_mask.append(0) - # segment_ids.append(0) - - # assert len(input_ids) == max_seq_length - # assert len(input_mask) == max_seq_length - # assert len(segment_ids) == max_seq_length + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length def create_int_feature(values): feature = np.asarray(values, dtype=np.int32, order=None) diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py b/models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py similarity index 100% rename from models/nlp/plm/bert_large_squad/ixrt/python/helpers/tokenization.py rename to models/nlp/plm/bert_large_squad/ixrt/helpers/tokenization.py diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py b/models/nlp/plm/bert_large_squad/ixrt/inference.py similarity index 79% rename from models/nlp/plm/bert_large_squad/ixrt/python/inference.py rename to models/nlp/plm/bert_large_squad/ixrt/inference.py index ec93972d295cc3fa777ab60cf82d12401b99f7c3..1ed6e088911d9c17166b85d5b6df81a8e3d0c667 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/inference.py +++ b/models/nlp/plm/bert_large_squad/ixrt/inference.py @@ -38,8 +38,8 @@ import argparse import collections import numpy as np import tensorrt as trt -import pycuda.driver as cuda -import pycuda.autoinit +import cuda.cuda as cuda +import cuda.cudart as cudart import helpers.tokenization as tokenization import helpers.data_processing as dp @@ -153,14 +153,15 @@ if __name__ == '__main__': break if selected_profile == -1: raise RuntimeError("Could not find any profile that can run batch size {}.".format(args.batch_size)) - + # Create a stream in which to copy inputs/outputs and run inference. - stream = cuda.Stream() + err_dr, stream = cuda.cuStreamCreate(0) + assert(err_dr == cuda.CUresult.CUDA_SUCCESS) # if args.use_trt: # context.active_optimization_profile = selected_profile # else: - context.set_optimization_profile_async(selected_profile, stream.handle) + context.set_optimization_profile_async(selected_profile, stream) binding_idx_offset = selected_profile * num_binding_per_profile input_shape = (args.batch_size, max_seq_length) @@ -170,11 +171,17 @@ if __name__ == '__main__': assert context.all_binding_shapes_specified # Allocate device memory for inputs. - d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] + d_inputs = [] + for binding in range(3): + err, ptr = cuda.cuMemAlloc(input_nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) + d_inputs.append(ptr) # Allocate output buffer by querying the size from the context. This may be different for different input shapes. 
- h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32) - d_output = cuda.mem_alloc(h_output.nbytes) + h_output = np.empty(tuple(context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32) + + err, d_output = cuda.cuMemAlloc(h_output.nbytes) + assert(err == cuda.CUresult.CUDA_SUCCESS) def inference(features, tokens): global h_output @@ -191,25 +198,32 @@ if __name__ == '__main__': segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0) input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0) - input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel())) - segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel())) - input_mask = cuda.register_host_memory(np.ascontiguousarray(input_mask_batch.ravel())) + input_ids = cuda.cuMemHostRegister(np.ascontiguousarray(input_ids_batch.ravel()), input_ids_batch.nbytes) + segment_ids = cuda.cuMemHostRegister(np.ascontiguousarray(segment_ids_batch.ravel()), segment_ids_batch.nbytes) + input_mask = cuda.cuMemHostRegister(np.ascontiguousarray(input_mask_batch.ravel()), input_mask.nbytes) eval_start_time = time.time() - cuda.memcpy_htod_async(d_inputs[0], input_ids, stream) - cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream) - cuda.memcpy_htod_async(d_inputs[2], input_mask, stream) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids, input_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids, segment_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[2], input_mask, input_mask.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) # Run inference - context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream) # Synchronize the stream - stream.synchronize() + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) eval_time_elapsed += (time.time() - eval_start_time) # Transfer predictions back from GPU - cuda.memcpy_dtoh_async(h_output, d_output, stream) - stream.synchronize() - + err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + # for x in h_output[0].reshape(-1,2): + # print(x) # Only retrieve and post-process the first batch batch = h_output[0] @@ -218,7 +232,7 @@ if __name__ == '__main__': end_logits = np.array(batch.squeeze()[:, 1]), feature_index = feature_index )) - + eval_time_elapsed /= len(features) # Total number of n-best predictions to generate in the nbest_predictions.json output file @@ -258,14 +272,16 @@ if __name__ == '__main__': batch_example = [] max_batch_length = 0 seq_length_list = [] - for index in tqdm(sort_index): + for index in sort_index: batch_feature.append(features_list[index]) batch_example.append(squad_examples[index]) max_batch_length = max(max_batch_length, len(features_list[index].input_ids)) if args.int8: - max_batch_length = max_seq_length - else: max_batch_length = math.ceil(max_batch_length / 2) * 2 + else: + # workround to solve bs=1 10% slow + if args.batch_size == 1: + 
max_batch_length = math.ceil(max_batch_length / 64) * 64 seq_length_list.append(len(features_list[index].input_ids)) if len(batch_feature) == args.batch_size: batch_input_ids = [ @@ -319,28 +335,39 @@ if __name__ == '__main__': for binding in range(3): context.set_binding_shape(binding, (args.batch_size, max_seq_length)) assert context.all_binding_shapes_specified - cuda.memcpy_htod_async(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream) - cuda.memcpy_htod_async(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), stream) - context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) - stream.synchronize() + err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel(), np.zeros((args.batch_size, max_seq_length), dtype=np.int32).ravel().nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) - start_time = time.time() + infer_toal_time = 0 output_index = 0 for input_ids, segment_ids in tqdm(all_token_ids): for binding in range(3): context.set_binding_shape(binding, input_ids.shape) assert context.all_binding_shapes_specified - cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream) - cuda.memcpy_htod_async(d_inputs[1], segment_ids.ravel(), stream) - stream.synchronize() + err, = cuda.cuMemcpyHtoDAsync(d_inputs[0], input_ids.ravel(), input_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuMemcpyHtoDAsync(d_inputs[1], segment_ids.ravel(), segment_ids.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + infer_start_time = time.time() + context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + infer_end_time = time.time() + infer_time = infer_end_time - infer_start_time + infer_toal_time += infer_time + err, = cuda.cuMemcpyDtoHAsync(h_output, d_output, h_output.nbytes, stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) + err, = cuda.cuStreamSynchronize(stream) + assert(err == cuda.CUresult.CUDA_SUCCESS) - context.execute_async_v2(bindings=[0 for i in range(binding_idx_offset)] +[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle) - stream.synchronize() - - cuda.memcpy_dtoh_async(h_output, d_output, stream) - stream.synchronize() - new_h_output = np.array(h_output.reshape(-1)[:input_ids.shape[0]*input_ids.shape[1]*2]).reshape(input_ids.shape[0], input_ids.shape[1], 2) for index in range(input_ids.shape[0]): networkOutputs.append(_NetworkOutput( @@ -349,7 +376,12 @@ if __name__ == '__main__': feature_index = index )) output_index += 1 - infer_time = time.time() - start_time + for i in range(3): + err, = cuda.cuMemFree(d_inputs[i]) + assert(err == cuda.CUresult.CUDA_SUCCESS) 
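The hunks above migrate the host/device plumbing from pycuda to the cuda-python driver bindings, where every call returns a `CUresult` status that the caller has to check. A minimal sketch of that allocate/copy/synchronize/free pattern, assuming `from cuda import cuda` as in this file (the `check` helper and the buffer shape are illustrative, not part of the patch):

```python
# Sketch of the cuda-python error-checking pattern used in the hunks above.
# The `check` helper and the buffer shape are illustrative only.
import numpy as np
from cuda import cuda

def check(err):
    # Every driver call returns a CUresult status first; fail fast on error.
    assert err == cuda.CUresult.CUDA_SUCCESS, err

err, = cuda.cuInit(0)
check(err)
err, device = cuda.cuDeviceGet(0)
check(err)
err, ctx = cuda.cuCtxCreate(0, device)
check(err)
err, stream = cuda.cuStreamCreate(0)
check(err)

h_buf = np.zeros((32, 384), dtype=np.int32)
err, d_buf = cuda.cuMemAlloc(h_buf.nbytes)   # returns (CUresult, CUdeviceptr)
check(err)
err, = cuda.cuMemcpyHtoDAsync(d_buf, h_buf, h_buf.nbytes, stream)
check(err)
err, = cuda.cuStreamSynchronize(stream)
check(err)
err, = cuda.cuMemFree(d_buf)
check(err)
```

Unlike pycuda, these low-level bindings do not raise on failure, so the `assert err == cuda.CUresult.CUDA_SUCCESS` after each call in the patch is the only error handling those calls receive.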
+ err, = cuda.cuMemFree(d_output) + assert(err == cuda.CUresult.CUDA_SUCCESS) + output_index = 0 for (be, bf) in zip(batch_example_list, batch_feature_list): for index in range(len(bf)): @@ -357,7 +389,7 @@ if __name__ == '__main__': [networkOutputs[output_index]], args.n_best_size, args.max_answer_length) output_index += 1 all_precision[be[index].id] = prediction - return infer_time, all_precision + return infer_toal_time, all_precision status = 0 if squad_examples: @@ -366,20 +398,18 @@ if __name__ == '__main__': features_list = [] lengths = [] - for example_index, example in tqdm(enumerate(squad_examples)): + for example_index, example in enumerate(squad_examples): features = question_features(example.doc_tokens, example.question_text) features_list.append(features[0]) lengths.append(len(features[0].input_ids)) sort_index = np.argsort(lengths) - infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) - print(F"E2E time : {infer_time:.3f} seconds") + infer_time, all_predictions = inference_all_dynamic(features_list, squad_examples, sort_index, all_predictions) - qps = len(squad_examples)/infer_time + qps = math.ceil(len(squad_examples)/args.batch_size)*args.batch_size/infer_time print(f"Latency QPS: {qps} sentences/s") metricResult = {"metricResult": {}} - metricResult["metricResult"]["E2E time"] = round(infer_time, 3) - metricResult["metricResult"]["Latency QPS"] = round(qps, 3) + metricResult["metricResult"]["qps"] = qps print(metricResult) with open(output_prediction_file, "w") as f: @@ -415,4 +445,4 @@ if __name__ == '__main__': # question_text = input("Question (to exit, type one of {:}): ".format(EXIT_CMDS)) del context del engine - sys.exit(status) \ No newline at end of file + sys.exit(status) diff --git a/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..b370130872bdf2a94cdb87b42909a6b1ce889b58 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/load_ixrt_plugin.py @@ -0,0 +1,13 @@ +from os.path import join, dirname, exists +import tensorrt as trt +import ctypes + +def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): + if not dynamic_path: + dynamic_path = join(dirname(trt.__file__), "lib", "libixrt_plugin.so") + if not exists(dynamic_path): + raise FileNotFoundError( + f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") + ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) + trt.init_libnvinfer_plugins(logger, namespace) + print(f"Loaded plugin from {dynamic_path}") diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py b/models/nlp/plm/bert_large_squad/ixrt/perf.py similarity index 97% rename from models/nlp/plm/bert_large_squad/ixrt/python/perf.py rename to models/nlp/plm/bert_large_squad/ixrt/perf.py index 968a39435bd597639e427ad2ac745579250cda0f..8343c95d0a57f374f091617b8099951ed66c2ea1 100644 --- a/models/nlp/plm/bert_large_squad/ixrt/python/perf.py +++ b/models/nlp/plm/bert_large_squad/ixrt/perf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. # All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); you may @@ -14,6 +14,8 @@ # under the License. # # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + + # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -55,6 +57,7 @@ class DeviceBuffer(object): def main(): parser = argparse.ArgumentParser(description='BERT Inference Benchmark') + parser.add_argument("-z", "--use_trt", action="store_false", help="Whether to use tensorRT or IxRT") parser.add_argument("-e", "--engine", help='Path to BERT TensorRT engine') parser.add_argument('-b', '--batch-size', default=[], action="append", help='Batch size(s) to benchmark. Can be specified multiple times for more than one batch size. This script assumes that the engine has been built with one optimization profile for each batch size, and that these profiles are in order of increasing batch size.', type=int) parser.add_argument('-s', '--sequence-length', default=128, help='Sequence length of the BERT model', type=int) @@ -66,7 +69,7 @@ def main(): args.batch_size = args.batch_size or [1] # Import necessary plugins for BERT TensorRT - load_ixrt_plugin(TRT_LOGGER, dynamic_path="../build/libixrt_plugin.so") + load_ixrt_plugin(TRT_LOGGER) with open(args.engine, 'rb') as f: runtime = trt.Runtime(TRT_LOGGER) diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py b/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py deleted file mode 100644 index 93301c303658d92832d68c78b61757610d6ab201..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/load_ixrt_plugin.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
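For reference, the new `load_ixrt_plugin.py` above resolves `libixrt_plugin.so` from the `tensorrt` package directory, loads it with `RTLD_GLOBAL`, and registers its creators via `trt.init_libnvinfer_plugins`; this is why `perf.py` can now call `load_ixrt_plugin(TRT_LOGGER)` without an explicit `dynamic_path`. A minimal usage sketch (the engine path is illustrative and assumes a prior build step):

```python
# Minimal usage sketch for the new load_ixrt_plugin helper shown above;
# the engine path is illustrative.
import tensorrt as trt
from load_ixrt_plugin import load_ixrt_plugin

logger = trt.Logger(trt.Logger.WARNING)
load_ixrt_plugin(logger)  # must run before deserializing an engine that uses ixrt plugins

with open("data/checkpoints/bert-large-uncased/bert_large_b32.engine", "rb") as f, \
        trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()
```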
- -from os.path import join, dirname, exists, abspath -import tensorrt as trt -import ctypes -import os -import subprocess - -def is_nvidia_platform(): - try: - # 尝试运行 nvidia-smi - subprocess.check_output(['nvidia-smi']) - return True - except (subprocess.CalledProcessError, FileNotFoundError): - return False - -def load_ixrt_plugin(logger=trt.Logger(trt.Logger.WARNING), namespace="", dynamic_path=""): - if not dynamic_path: - dynamic_path = join(dirname(abspath(__file__)), "..", "build", "libixrt_plugin.so") - if not exists(dynamic_path): - raise FileNotFoundError( - f"The ixrt_plugin lib {dynamic_path} is not existed, please provided effective plugin path!") - handle = ctypes.CDLL(dynamic_path, mode=ctypes.RTLD_GLOBAL) - handle.initLibNvInferPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initLibNvInferPlugins.restype = ctypes.c_bool - handle.initLibNvInferPlugins(None, namespace.encode('utf-8')) - print(f"Loaded plugin from {dynamic_path}") \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh deleted file mode 100644 index 7a7a05c5dbca037d8f31ef6c9b707a800902df2f..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/build_engine.sh +++ /dev/null @@ -1,34 +0,0 @@ -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 builder.py -x ./data/bert-large-uncased/bert_large_v1_1_fake_quant.onnx \ - -w 4096 \ - -o ./data/bert_large_384.engine \ - -s 1 384 384 \ - -b 1 ${BSZ} ${BSZ} \ - --fp16 \ - -c ./data/bert-large-uncased/bert_config.json -else - echo 'USE_INT8=True' - python3 builder_int8.py -pt ./data/bert-large-uncased/bert_large_int8_qat.bin \ - -o ./data/bert_large_384_int8.engine \ - -s 1 384 384 \ - -b 1 ${BSZ} ${BSZ} \ - -i \ - -c ./data/bert-large-uncased/bert_config.json -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh deleted file mode 100644 index 550c735e85b3b80202041a4ac878e73bcfeeaa14..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference.sh +++ /dev/null @@ -1,36 +0,0 @@ -PASSAGE='TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps such as recommenders, -speech and image/video on NVIDIA GPUs. It includes parsers to import models, and plugins to support novel ops and layers before applying optimizations -for inference. Today NVIDIA is open-sourcing parsers and plugins in TensorRT so that the deep learning community can customize and extend these components -to take advantage of powerful TensorRT optimizations for your apps.' -QUESTION="What is TensorRT?" 
- -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 inference.py -e ./data/bert_large_384.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert-large-uncased/vocab.txt -else - echo 'USE_INT8=True' - python3 inference.py -e ./data/bert_large_384_int8.engine \ - -s 384 \ - -p $PASSAGE \ - -q $QUESTION \ - -v ./data/bert-large-uncased/vocab.txt -fi - diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh deleted file mode 100644 index 088b1d39ed738804ab4959b2cfcb4948a02f4c33..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/inference_squad.sh +++ /dev/null @@ -1,36 +0,0 @@ -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384.engine \ - -b ${BSZ} \ - -s 384 \ - -sq ./data/squad/dev-v1.1.json \ - -v ./data/bert-large-uncased/vocab.txt \ - -o ./data/predictions-bert_large_384.json - python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_large_384.json 90 -else - echo 'USE_INT8=True' - UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ./data/bert_large_384_int8.engine \ - -b ${BSZ} \ - -s 384 \ - -sq ./data/squad/dev-v1.1.json \ - -v ./data/bert-large-uncased/vocab.txt \ - -o ./data/predictions-bert_large_384_int8.json \ - -i - python3 evaluate-v1.1.py ./data/squad/dev-v1.1.json ./data/predictions-bert_large_384_int8.json 88 -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh deleted file mode 100644 index f19c1def4b139edc1e02b7ae595327dc367c919e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/mdb_infer_run.sh +++ /dev/null @@ -1,63 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- -index=0 -options=("$@") # 将所有参数存储到数组中 -PRECISION=fp16 -BSZ=32 - -# 循环遍历所有参数 -while [[ $index -lt ${#options[@]} ]]; do - argument=${options[$index]} - case $argument in - --bs) - ((index++)) - BSZ=${options[$index]} - ;; - --prec) - ((index++)) - PRECISION=${options[$index]} - ;; - esac - ((index++)) -done - -# 设置INT8_FLAG -INT8_FLAG="" -if [[ "$PRECISION" == "int8" ]]; then - INT8_FLAG="--int8" -fi - -echo "PREC_FLAG=$INT8_FLAG" -echo "PRECISION=$PRECISION" -echo "BSZ=$BSZ" - -# 检查环境并执行相应的脚本 -if command -v ixsmi &>/dev/null; then - echo "MR env" - cmake -S . -B build - cmake --build build -j16 -elif command -v nvidia-smi &>/dev/null; then - echo "NV env" - cmake -S . -B build -DUSE_TENSORRT=true - cmake --build build -j16 -else - echo "No driver detected" - exit 1 -fi -cd ./python/ -bash script/build_engine.sh --bs $BSZ $INT8_FLAG -bash script/inference_squad.sh --bs $BSZ $INT8_FLAG diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh deleted file mode 100644 index 1ad462a763ccf37c19a0914b6a9b684bae52232c..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/perf.sh +++ /dev/null @@ -1,23 +0,0 @@ -BSZ=1 -USE_FP16=True - -# Update arguments -index=0 -options=$@ -arguments=($options) -for argument in $options -do - index=`expr $index + 1` - case $argument in - --bs) BSZ=${arguments[index]};; - --int8) USE_FP16=False;; - esac -done - -if [ "$USE_FP16" = "True" ]; then - echo 'USE_FP16=True' - python3 perf.py -e ./data/bert_large_384.engine -b ${BSZ} -s 384 -else - echo 'USE_INT8=True' - python3 perf.py -e ./data/bert_large_384_int8.engine -b ${BSZ} -s 384 -fi \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh b/models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh deleted file mode 100644 index b8cf85a35bc1b34fd4537494ee61c8a7ca788e74..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/python/script/prepare.sh +++ /dev/null @@ -1,48 +0,0 @@ -VERSION='v1.1' - -while test $# -gt 0 -do - case "$1" in - -h) echo "Usage: sh download_squad.sh [v2_0|v1_1]" - exit 0 - ;; - v2_0) VERSION='v2.0' - ;; - v1_1) VERSION='v1.1' - ;; - *) echo "Invalid argument $1...exiting" - exit 0 - ;; - esac - shift -done - -# Download the SQuAD training and dev datasets -echo "Step 1: Downloading SQuAD-${VERSION} training and dev datasets to ./data/squad" -if [ ! -d "./data" ]; then - mkdir -p data -else - echo 'data directory existed' -fi - -pushd data -if [ ! -d "./squad" ]; then - mkdir -p squad - pushd squad - wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-${VERSION}.json - wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-${VERSION}.json - popd -else - echo 'squad directory existed' -fi - -echo "Step 2: Downloading model file and config to ./data/bert-large-uncased" - -if [ ! 
-d "./bert-large-uncased" ]; then - wget https://drive.google.com/file/d/1eD8QBkbK6YN-_YXODp3tmpp3cZKlrPTA/view?usp=drive_link - unzip bert-large-uncased.zip -d ./ - rm -f bert-large-uncased.zip -else - echo 'bert-large-uncased directory existed' -fi -popd diff --git a/models/nlp/plm/bert_large_squad/ixrt/requirements.txt b/models/nlp/plm/bert_large_squad/ixrt/requirements.txt deleted file mode 100644 index 36a8378f4b87bbfc5b0189acc602444b005f1867..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -onnx -pycuda -six -tqdm \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..a317feaa63313820fe79e0c23c57d0706314fa27 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_accuracy.sh @@ -0,0 +1,47 @@ +set -eo pipefail + +BSZ=32 +TGT=90 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert-large-uncased +datasets_path=${project_path}/data/datasets/squad + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine FP16(bert large squad)!" +python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \ + -w 4096 \ + -o ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 1 384 384\ + -b 1 ${BSZ} ${BSZ}\ + --fp16 \ + -c ${checkpoints_path}/bert_config.json \ + -z ${USE_TRT} + +echo "Step2 Run dev.json and generate json" +python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 384 \ + -b ${BSZ} \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \ + -z ${USE_TRT} + +echo "Step3 Inference(test F1-score)" +python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_large_b${BSZ}.json ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..c10f0198c68aa8aafa3fa7815a5dd5d25a0a6391 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_fp16_performance.sh @@ -0,0 +1,46 @@ +set -eo pipefail + +BSZ=32 +TGT=150 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert-large-uncased +datasets_path=${project_path}/data/datasets/squad + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine FP16(bert large squad)!" 
+python3 builder.py -x ${checkpoints_path}/bert_large_v1_1_fake_quant.onnx \ + -w 4096 \ + -o ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 1 384 384\ + -b 1 ${BSZ} ${BSZ}\ + --fp16 \ + -c ${checkpoints_path}/bert_config.json \ + -z ${USE_TRT} + +echo "Step2 Inference(test QPS)" +UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_b${BSZ}.engine \ + -s 384 \ + -b ${BSZ} \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_b${BSZ}.json \ + -z ${USE_TRT} \ + --target_qps ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_accuracy.sh b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_accuracy.sh new file mode 100644 index 0000000000000000000000000000000000000000..00857d37c0e4b8923721608c89d0fc069602f77f --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_accuracy.sh @@ -0,0 +1,47 @@ +set -eo pipefail + +BSZ=32 +TGT=88 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert-large-uncased +datasets_path=${project_path}/data/datasets/squad + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine Int8(bert large squad)!" +python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \ + -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b 1 ${BSZ} ${BSZ} \ + -s 1 384 384 \ + -i \ + -c ${checkpoints_path} + +echo "Step2 Run dev.json and generate json" +python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b ${BSZ} \ + -s 384 \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \ + -z ${USE_TRT} \ + -i + +echo "Step3 Inference(test F1-score)" +python3 evaluate-v1.1.py ${datasets_path}/squad/dev-v1.1.json ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json ${TGT} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_performance.sh b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_performance.sh new file mode 100644 index 0000000000000000000000000000000000000000..d484f7bf2ae32ccb4f5642760d7c67eb5c0c6632 --- /dev/null +++ b/models/nlp/plm/bert_large_squad/ixrt/script/infer_bert_large_squad_int8_performance.sh @@ -0,0 +1,45 @@ +set -eo pipefail + +BSZ=32 +TGT=200 +USE_TRT=False + +# Update arguments +index=0 +options=$@ +arguments=($options) +for argument in $options +do + index=`expr $index + 1` + case $argument in + --bs) BSZ=${arguments[index]};; + --tgt) TGT=${arguments[index]};; + --use_trt) USE_TRT=${arguments[index]};; + esac +done + +project_path=./ +checkpoints_path=${project_path}/data/checkpoints/bert-large-uncased +datasets_path=${project_path}/data/datasets/squad + +echo 'USE_TRT='${USE_TRT} +export USE_TRT=$USE_TRT + +echo "Step1 Build Engine Int8(bert large squad)!" 
+python3 builder_int8.py -pt ${checkpoints_path}/bert_large_int8_qat.bin \ + -o ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b 1 ${BSZ} ${BSZ} \ + -s 1 384 384 \ + -i \ + -c ${checkpoints_path} + +echo "Step2 Inference(test QPS)" +UMD_ENABLEDCPRINGNUM=16 python3 inference.py -e ${checkpoints_path}/bert_large_int8_b${BSZ}.engine \ + -b ${BSZ} \ + -s 384 \ + -sq ${datasets_path}/squad/dev-v1.1.json \ + -v ${checkpoints_path}/vocab.txt \ + -o ${checkpoints_path}/predictions-bert_large_int8_b${BSZ}.json \ + -z ${USE_TRT} \ + --target_qps ${TGT} \ + -i \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc b/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc deleted file mode 100644 index ceea8d8b80468cce08d46637073504e1d3a4057f..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/api/plugin_loader.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvInferPlugin.h" -#include "NvInferRuntimeCommon.h" -#include "custom_fc/fcPlugin.h" -#include "emb_layernorm/embLayerNormPlugin.h" -#include "emb_layernorm/embLayerNormInt8Plugin.h" -#include "gelu/geluPlugin.h" -#include "qkv_to_context/qkvToContextInt8Plugin.h" -#include "qkv_to_context/qkvToContextPlugin.h" -#include "skip_layernorm/skipLayerNormInt8Plugin.h" -#include "skip_layernorm/skipLayerNormPlugin.h" -#include "ffn/ffnPlugin.h" - -using namespace nvinfer1; - -namespace nvinfer1 { -namespace ixrt_plugin { - -extern ILogger* gLogger; - -} // namespace plugin -} // namespace nvinfer1 - -namespace { -// This singleton ensures that each plugin is only registered once for a given -// namespace and type, and attempts of duplicate registration are ignored. 
-class PluginCreatorRegistry { - public: - static PluginCreatorRegistry& getInstance() { - static PluginCreatorRegistry instance; - return instance; - } - - string GetPluginUniqKey(const AsciiChar* const plugin_namespace, const AsciiChar* const plugin_name, - const AsciiChar* const plugin_version) { - stringstream os; - if (plugin_namespace[0] != '\0') { - os << plugin_namespace << "/"; - } - os << plugin_name; - if (plugin_version[0] != '\0') { - os << "/" << plugin_version; - } - return os.str(); - } - - template - void addPluginCreator(void* logger, char const* libNamespace) { - printf("start addPluginCreator %s\n", libNamespace); - // Make accesses to the plugin creator registry thread safe - std::lock_guard lock(mRegistryLock); - - std::string errorMsg; - std::string verboseMsg; - - std::unique_ptr pluginCreator{new CreatorType{}}; - pluginCreator->setPluginNamespace(libNamespace); - - nvinfer1::ixrt_plugin::gLogger = static_cast(logger); - std::string pluginType = GetPluginUniqKey(pluginCreator->getPluginNamespace(), pluginCreator->getPluginName(), - pluginCreator->getPluginVersion()); - - if (mRegistryList.find(pluginType) == mRegistryList.end()) { - bool status = getPluginRegistry()->registerCreator(*pluginCreator, libNamespace); - if (status) { - mRegistry.push(std::move(pluginCreator)); - mRegistryList.insert(pluginType); - printf("Registered plugin creator - %s\n", pluginType.c_str()); - verboseMsg = "Registered plugin creator - " + pluginType; - } else { - printf("Could not register plugin creator - %s\n", pluginType.c_str()); - errorMsg = "Could not register plugin creator - " + pluginType; - } - } else { - printf("Plugin creator already registered - %s\n", pluginType.c_str()); - verboseMsg = "Plugin creator already registered - " + pluginType; - } - - if (logger) { - if (!errorMsg.empty()) { - nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kERROR, errorMsg.c_str()); - } - if (!verboseMsg.empty()) { - nvinfer1::ixrt_plugin::gLogger->log(ILogger::Severity::kVERBOSE, verboseMsg.c_str()); - } - } - } - - ~PluginCreatorRegistry() { - std::lock_guard lock(mRegistryLock); - - // Release pluginCreators in LIFO order of registration. 
- while (!mRegistry.empty()) { - mRegistry.pop(); - } - mRegistryList.clear(); - } - - private: - PluginCreatorRegistry() {} - - std::mutex mRegistryLock; - std::stack> mRegistry; - std::unordered_set mRegistryList; - - public: - PluginCreatorRegistry(PluginCreatorRegistry const&) = delete; - void operator=(PluginCreatorRegistry const&) = delete; -}; - -template -void initializePlugin(void* logger, char const* libNamespace) { - PluginCreatorRegistry::getInstance().addPluginCreator(logger, libNamespace); -} - -} // namespace - -extern "C" { -bool initLibNvInferPlugins(void* logger, const char* libNamespace) { - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - initializePlugin(logger, libNamespace); - return true; -} -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h deleted file mode 100644 index bd094b403acf8fdc83b90ea6628c989e84815316..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/bert/bert_helper.h +++ /dev/null @@ -1,299 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#pragma once -#include -#include - -#include - -#ifndef C10_WARP_SIZE - -#ifdef __ILUVATAR__ -#define C10_WARP_SIZE 64 -#else -#define C10_WARP_SIZE 32 -#endif - -#endif - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace backend { - -const float epsilon = 0.000000000001; -const unsigned int WARP_REDUCE_MASK = 0xffffffff; -const float CUDA_FLOAT_INF_NEG = -100000000.f; // FIXME later -const float CUDA_FLOAT_INF_POS = 100000000.f; // FIXME later -const int CUDA_INT_INF = 2147483647; -const int MAX_THREADS = 1024; - -__forceinline__ __device__ int8_t float2int8(float x, float quant_scale) { - float i8_f = x * quant_scale; - int32_t i8 = floorf(i8_f + 0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 
127 : i8); - return int8_t(i8); -} - -inline __device__ void WelfordCombine(float val, float *mean, float *m2, float *count) { - // Use Welford Online algorithem to compute mean and variance - // For more details you can refer to: - // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm - *count += 1; - float delta1 = val - *mean; - *mean += delta1 / *count; - float delta2 = val - *mean; - *m2 += delta1 * delta2; -} - -inline __device__ void WelfordCombine(float b_mean, float b_m2, float b_count, float *mean, float *m2, float *count) { - if (b_count == 0) { - return; - } - float new_count = *count + b_count; - float nb_over_n = b_count / new_count; - float delta = b_mean - *mean; - *mean += delta * nb_over_n; - *m2 += b_m2 + delta * delta * (*count) * nb_over_n; - *count = new_count; -} - -__inline__ __device__ void WelfordWarpReduce(float thread_mean, float thread_m2, float thread_count, float *mean, - float *m2, float *count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; - for (int mask = C10_WARP_SIZE / 2; mask > 0; mask /= 2) { - float b_mean = __shfl_down_sync(0xffffffff, *mean, mask); - float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask); - float b_count = __shfl_down_sync(0xffffffff, *count, mask); - WelfordCombine(b_mean, b_m2, b_count, mean, m2, count); - } -} -// addd by pxl -// block内所有数据完成reduce -// template -__inline__ __device__ void WelfordBlockAllReduce(float thread_mean, float thread_m2, float thread_count, - float *result_mean, float *result_m2, float *result_count) { - __shared__ float mean_shared[C10_WARP_SIZE]; - __shared__ float m2_shared[C10_WARP_SIZE]; - __shared__ float count_shared[C10_WARP_SIZE]; - __shared__ float mean_result_broadcast; - __shared__ float m2_result_broadcast; - __shared__ float count_result_broadcast; - - const int lid = threadIdx.x % C10_WARP_SIZE; - const int wid = threadIdx.x / C10_WARP_SIZE; - float warp_mean = 0; - float warp_m2 = 0; - float warp_count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &warp_mean, &warp_m2, &warp_count); - __syncthreads(); - - if (lid == 0) { - mean_shared[wid] = warp_mean; - m2_shared[wid] = warp_m2; - count_shared[wid] = warp_count; - } - __syncthreads(); - - if (wid == 0) { - if (threadIdx.x < blockDim.x / C10_WARP_SIZE) { - warp_mean = mean_shared[lid]; - warp_m2 = m2_shared[lid]; - warp_count = count_shared[lid]; - - } else { - warp_mean = 0.f; - warp_m2 = 0.f; - warp_count = 0.f; - } - __syncwarp(); - - float block_mean = 0; - float block_m2 = 0; - float block_count = 0; - - WelfordWarpReduce(warp_mean, warp_m2, warp_count, &block_mean, &block_m2, &block_count); - - if (lid == 0) { - mean_result_broadcast = block_mean; - m2_result_broadcast = block_m2; - count_result_broadcast = block_count; - } - } - __syncthreads(); - *result_mean = mean_result_broadcast; - *result_m2 = m2_result_broadcast; - *result_count = count_result_broadcast; -} -__forceinline__ __device__ char4 float42char4(float4 vals, float quant_scale) { - char4 res; - res.x = float2int8(vals.x, quant_scale); - res.y = float2int8(vals.y, quant_scale); - res.z = float2int8(vals.z, quant_scale); - res.w = float2int8(vals.w, quant_scale); - return res; -} - -// load 两个 half2, 保存到 float4 -__forceinline__ __device__ void load_float4_from_half(float4 &vals, __half2 *input, int index) { - __half2 i1 = input[index * 2]; - __half2 i2 = input[index * 2 + 1]; - - vals.x = __half2float(i1.x); - vals.y = __half2float(i1.y); - vals.z = __half2float(i2.x); - vals.w = 
__half2float(i2.y); -} - -/* Convert vector index to 3-dim tensor index */ -__forceinline__ __host__ __device__ void decompose_3dim(int src, int dim1, int dim2, int *id0, int *id1, int *id2) { - *id2 = src % dim2; - src /= dim2; - - *id1 = src % dim1; - *id0 = src / dim1; -} - -__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size, - float epsilon, float4 scale, float4 bias) { - float4 norm_value; - norm_value.x = - (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.x + bias.x; - norm_value.y = - (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.y + bias.y; - norm_value.z = - (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.z + bias.z; - norm_value.w = - (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * scale.w + bias.w; - return norm_value; -} - -// for layer norm -__forceinline__ __device__ float4 compute_float4_norm_value(float4 vals, float mean, float m2, int hidden_size, - float epsilon, half2 scale_1, half2 scale_2, half2 bias_1, - half2 bias_2) { - float4 norm_value; - norm_value.x = - (vals.x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x); - norm_value.y = - (vals.y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y); - norm_value.z = - (vals.z - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.x) + __half2float(bias_2.x); - norm_value.w = - (vals.w - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_2.y) + __half2float(bias_2.y); - return norm_value; -} -/* Convert half2 into float2, mask inf and -inf */ -__forceinline__ __host__ __device__ float safe_half_to_float(half hval) { - return fmax(fmin(100000.f, __half2float(hval)), -100000.f); -} -__forceinline__ __device__ float4 char4addfloat4_dequant(char4 input_4, float4 residual, - float dequant_scale) { - float4 res; - res.x = __int2float_rn(input_4.x) * dequant_scale + residual.x; - res.y = __int2float_rn(input_4.y) * dequant_scale + residual.y; - res.z = __int2float_rn(input_4.z) * dequant_scale + residual.z; - res.w = __int2float_rn(input_4.w) * dequant_scale + residual.w; - return res; -} - -__forceinline__ __device__ float4 char4addhalf2_dequant(char4 input_4, half2 residual_1, half2 residual_2, - float dequant_scale) { - float4 res; - res.x = __int2float_rn(input_4.x) * dequant_scale + safe_half_to_float(residual_1.x); - res.y = __int2float_rn(input_4.y) * dequant_scale + safe_half_to_float(residual_1.y); - res.z = __int2float_rn(input_4.z) * dequant_scale + safe_half_to_float(residual_2.x); - res.w = __int2float_rn(input_4.w) * dequant_scale + safe_half_to_float(residual_2.y); - return res; -} - -// gelu -// IxinferBiasGeluI8II8OKernel -template -__forceinline__ __device__ T tanhf_exp(T x) { - // float e1 = __expf(x); - // float e2 = 1.0f / e1; - // return (e1 - e2) / (e1 + e2); - - return (2.f / (1.f + __expf(-2.f * x)) - 1.f); -} - -template -__forceinline__ __device__ T gelu(T x) { - float cdf = 0.5f * (1.0f + tanhf_exp((0.7978845608028654f * (x + 0.044715f * x * x * x)))); - return x * cdf; -} - -// softmax -__forceinline__ __host__ __device__ int log2_ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} -template -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width, unsigned int mask = 0xffffffff) { -#if !(defined(__HIP_PLATFORM_HCC__) || defined(__ILUVATAR__)) - return __shfl_xor_sync(mask, value, laneMask, width); 
-#else - return __shfl_xor(value, laneMask, width); -#endif -} - -template -struct Add { - __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } -}; - -template -struct Max { - __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } -}; -template class ReduceOp> -__device__ __forceinline__ void warp_reduce(acc_t *sum) { - ReduceOp r; -#pragma unroll - for (int offset = REDUCE_WARP_SIZE / 2; offset > 0; offset /= 2) { - acc_t b = WARP_SHFL_XOR(*sum, offset, REDUCE_WARP_SIZE); - *sum = r(*sum, b); - } -} -/* Convert 3-dim tensor index into vector index */ -__forceinline__ __host__ __device__ int targetid_3dim(int id1, int id2, int id3, int dim2, int dim3) { - return id1 * dim2 * dim3 + id2 * dim3 + id3; -} - -/* Convert 4-dim tensor index into vector index */ -__forceinline__ __host__ __device__ int targetid_4dim(int id1, int id2, int id3, int id4, int dim2, int dim3, - int dim4) { - // return id1*(dim2*dim3*dim4) + id2*(dim3*dim4) + id3*dim4 + id4; - int res = id4; - - int ld = dim4; - res += id3 * ld; - - ld *= dim3; - res += id2 * ld; - - ld *= dim2; - res += id1 * ld; - - return res; -} - -} // namespace backend -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h deleted file mode 100644 index c0f3484255b81e5e3d60d79e981359d0fa90c1cf..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/cublas/cublas_helper.h +++ /dev/null @@ -1,312 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#include -#include -#include -#include - -#include - -#include "checkMacrosPlugin.h" - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace backend { - -/* GPU function guard */ - -/** - * @brief cublasLt gemm without imma - * - * @tparam OutType output dtype - * @tparam ScaleType scale dtype - * @param input_a - * @param input_b - * @param output_c - * @param batch_count - * @param m - * @param n - * @param k - * @param stridea - * @param strideb - * @param stridec - * @param alpha - * @param cublasLt_handle - * @param stream - */ -template -void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -inline void cublaslt_gemm(const half* input_a, 
const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { - cublasOperation_t transpose = CUBLAS_OP_T; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype = CUDA_R_16F; - cudaDataType_t scale_dtype = CUDA_R_32F; - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - CHECK_GPU_ERROR( - cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transpose, sizeof(transpose))); - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, k, m, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - float beta = 0.0; - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -/************add by pxl *************/ -template -void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, OutType* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const ScaleType alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - 
cublasComputeType_t compute_type = CUBLAS_COMPUTE_32I; -#else - cudaDataType_t compute_type = CUDA_R_32I; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t desc_c = NULL; - - cudaDataType_t out_dtype; - cudaDataType_t scale_dtype; - if (std::is_same::value) { - out_dtype = CUDA_R_32I; - scale_dtype = CUDA_R_32I; - } else if (std::is_same::value) { - out_dtype = CUDA_R_8I; - scale_dtype = CUDA_R_32F; - } else { - throw std::runtime_error("Unsupported output type"); - } - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_8I, m, k, m)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_8I, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - ScaleType beta = ScaleType(0); - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int32_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const int32_t alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -template void cublaslt_gemm_nn(const int8_t* input_a, const int8_t* input_b, int8_t* output_c, - int batchCount, int m, int n, int k, int64_t stridea, int64_t strideb, - int64_t stridec, const float alpha, cublasLtHandle_t cublasLt_handle, - cudaStream_t stream); - -inline void cublaslt_gemm_nn(const half* input_a, const half* input_b, half* output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cublasLtHandle_t cublasLt_handle, cudaStream_t stream) { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; -#else - cudaDataType_t compute_type = CUDA_R_32F; -#endif - cublasLtMatmulDesc_t matmul_desc; - cublasLtMatrixLayout_t desc_a = NULL; - cublasLtMatrixLayout_t desc_b = NULL; - cublasLtMatrixLayout_t 
desc_c = NULL; - - cudaDataType_t out_dtype = CUDA_R_16F; - cudaDataType_t scale_dtype = CUDA_R_32F; - -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_dtype)); -#else - CHECK_GPU_ERROR(cublasLtMatmulDescCreate(&matmul_desc, compute_type)); - CHECK_GPU_ERROR(cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_dtype, - sizeof(scale_dtype))); -#endif - - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_a, CUDA_R_16F, m, k, m)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_b, CUDA_R_16F, k, n, k)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutCreate(&desc_c, out_dtype, m, n, m)); - - if (batch_count > 1) { - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_a, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, - sizeof(stridea))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_b, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, - sizeof(strideb))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, - sizeof(batch_count))); - CHECK_GPU_ERROR(cublasLtMatrixLayoutSetAttribute(desc_c, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, - sizeof(stridec))); - } - - float beta = 0.0; - CHECK_GPU_ERROR(cublasLtMatmul(cublasLt_handle, matmul_desc, &alpha, input_a, desc_a, input_b, desc_b, &beta, - output_c, desc_c, output_c, desc_c, NULL, NULL, 0, stream)); - - CHECK_GPU_ERROR(cublasLtMatmulDescDestroy(matmul_desc)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_a)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_b)); - CHECK_GPU_ERROR(cublasLtMatrixLayoutDestroy(desc_c)); -} - -} // namespace backend -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu deleted file mode 100644 index b3f0bbcb3322868b8f9ec485cb294beec2373008..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.cu +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#include "ixinfer_gemm_helper.h" - -namespace nvinfer1::ixrt_plugin { -namespace backend { - -void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream) { - /* TN: input_a: m,k input_b: n,k output_c: n,m */ - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_N; - - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cudaDataType_t computeType = CUDA_R_32I; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - - int lda = k; - int ldb = k; - int ldc = m; - - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, nullptr, customOption); - - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error!, error type: " + std::to_string((int)status) + " !"); - } -} - -void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count, - int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_N; - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cudaDataType_t computeType = CUDA_R_32I; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - } - - int lda = k; - int ldb = k; - int ldc = m; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} - -void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream) { - /* TN: input_a: k,m input_b: n,k output_c: n,m */ - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_N; - cuinferOperation_t transb = CUINFER_OP_N; - - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cudaDataType_t computeType = CUDA_R_32I; - cudaDataType_t scaleType = CUDA_R_32F; - 
cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - - int lda = m; - int ldb = k; - int ldc = m; - - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, nullptr, customOption); - - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error!"); - } -} - -void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream) { - /* TN: input_a: k,m input_b: k,n output_c: n,m */ - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_N; - cuinferOperation_t transb = CUINFER_OP_T; - - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cudaDataType_t computeType = CUDA_R_32I; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - - int lda = m; - int ldb = n; - int ldc = m; - - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, nullptr, customOption); - - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error!"); - } -} - -void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream) { - /* TN: input_a: k,m input_b: k,n output_c: n,m */ - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_T; - - cudaDataType_t Atype = CUDA_R_8I; - cudaDataType_t Btype = CUDA_R_8I; - cudaDataType_t Ctype = CUDA_R_8I; - cudaDataType_t computeType = CUDA_R_32I; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - - int lda = k; - int ldb = n; - int ldc = m; - - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, nullptr, customOption); - - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error!"); - } -} - -void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k, - int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle, - cudaStream_t stream) { - /* Performs operation using cublas */ - float beta = 0.0f; - cublasSetStream(handle, stream); - cublasStatus_t status; - if (batch_count <= 1) { - status = cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k, input_b, - CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); - } else { 
- status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, k, - stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m, - stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); - } - if (status != CUBLAS_STATUS_SUCCESS) { - throw std::runtime_error("cuinfer_gemm error!"); - } -} - -void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k, - int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t handle, - cudaStream_t stream) { - /* Performs operation using cublas */ - float beta = 0.0f; - cublasSetStream(handle, stream); - cublasStatus_t status; - if (batch_count <= 1) { - // k,m n,k - status = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m, input_b, - CUDA_R_16F, k, &beta, output_c, CUDA_R_16F, m, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); - } else { - status = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, input_a, CUDA_R_16F, m, - stridea, input_b, CUDA_R_16F, k, strideb, &beta, output_c, CUDA_R_16F, m, - stridec, batch_count, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); - } - if (status != CUBLAS_STATUS_SUCCESS) { - throw std::runtime_error("cuinfer_gemm error!"); - } -} - -void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_N; - cudaDataType_t Atype = CUDA_R_16F; - cudaDataType_t Btype = CUDA_R_16F; - cudaDataType_t Ctype = CUDA_R_16F; - cudaDataType_t computeType = CUDA_R_32F; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl; - } - - int lda = k; - int ldb = k; - int ldc = m; - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} -void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_N; - cudaDataType_t Atype = CUDA_R_16F; - cudaDataType_t Btype = CUDA_R_16F; - cudaDataType_t Ctype = CUDA_R_16F; - cudaDataType_t computeType = CUDA_R_32F; - cudaDataType_t 
scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - // std::cout << "CUINFER_BLAS_GEMM_CUSTOM_NONE" << std::endl; - } - - int lda = k; - int ldb = k; - int ldc = m; - // float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} -void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_N; - cuinferOperation_t transb = CUINFER_OP_N; - cudaDataType_t Atype = CUDA_R_16F; - cudaDataType_t Btype = CUDA_R_16F; - cudaDataType_t Ctype = CUDA_R_16F; - cudaDataType_t computeType = CUDA_R_32F; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - } - - int lda = m; - int ldb = k; - int ldc = m; - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} -void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_N; - cuinferOperation_t transb = CUINFER_OP_T; - cudaDataType_t Atype = CUDA_R_16F; - cudaDataType_t Btype = CUDA_R_16F; - cudaDataType_t Ctype = CUDA_R_16F; - cudaDataType_t computeType = CUDA_R_32F; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - } - - int lda = m; - int ldb = n; - int ldc = 
m; - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} - -void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle) { - cuinferPointerMode_t cuinfer_ptr_mode = CUINFER_POINTER_MODE_HOST; - cuinferOperation_t transa = CUINFER_OP_T; - cuinferOperation_t transb = CUINFER_OP_T; - cudaDataType_t Atype = CUDA_R_16F; - cudaDataType_t Btype = CUDA_R_16F; - cudaDataType_t Ctype = CUDA_R_16F; - cudaDataType_t computeType = CUDA_R_32F; - cudaDataType_t scaleType = CUDA_R_32F; - cuinferGEMMCustomOption_t customOption; - if (bias != nullptr) { - if (act_type == 3) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_GELU; - - } else if (act_type == 4) { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS_RELU; - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_HALFBIAS; - } - } else { - customOption = CUINFER_BLAS_GEMM_CUSTOM_NONE; - } - - int lda = k; - int ldb = n; - int ldc = m; - float beta = 0.f; - - cuinferStatus_t status = - cuinferCustomGemm(cuinfer_handle, stream, cuinfer_ptr_mode, transa, transb, m, n, k, &alpha, input_a, Atype, - lda, stridea, input_b, Btype, ldb, strideb, &beta, output_c, Ctype, ldc, stridec, batch_count, - computeType, scaleType, nullptr, (void *)bias, customOption); - if (status != CUINFER_STATUS_SUCCESS) { - throw std::runtime_error("cuinferCustomGemm error, error type: " + std::to_string((int)status) + " !"); - } -} - -} // namespace backend -} // namespace nvinfer1::ixrt_plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h b/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h deleted file mode 100644 index 2433b3a15ad2b4fb277dc0c5a233f28541cbb132..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/backend/ixinfer/ixinfer_gemm_helper.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
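The half and int8 wrappers above all follow the "TN" layout spelled out in their comments: input_a is an (m x k) block (typically weights), input_b an (n x k) block (typically activations), and the result is written as an (n x m) block with ldc = m. For a BERT feed-forward projection that makes m the output feature count, n the number of tokens (batch x sequence length), and k the hidden size. The call below is a hypothetical illustration of that mapping against the fused FP16 overload declared in the header that follows; the function and variable names are placeholders, not identifiers from this repository.

```cpp
#include <cuda_fp16.h>

#include "ixinfer_gemm_helper.h"  // declares the wrappers removed in this diff

using nvinfer1::ixrt_plugin::backend::cuinfer_gemm;

// Hypothetical FFN projection with a fused GELU epilogue (act_type 3; 4 would select ReLU).
void ffn_project_gelu(const half* weight, const half* activations, const half* bias, half* output,
                      int batch_size, int seq_len, int hidden_size, int intermediate_size,
                      cudaStream_t stream, cuinferHandle_t cuinfer_handle) {
    int m = intermediate_size;     // output features per token
    int n = batch_size * seq_len;  // number of token vectors
    int k = hidden_size;           // inner dimension shared by weight and activations

    cuinfer_gemm(weight, activations, bias, output,
                 /*batch_count=*/1, m, n, k,
                 /*stridea=*/0, /*strideb=*/0, /*stridec=*/0,
                 /*alpha=*/1.0f, /*act_type=*/3, stream, cuinfer_handle);
}
```

The int8 overloads use the same m/n/k convention; their alpha argument carries the combined quantization scale, as the FC plugin later in this diff shows.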
-*/ -#pragma once -#include -#include -#include -#include - -#include - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace backend { - -void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream); - -void cuinfer_i8_gemm(const int8_t *input_a, const int8_t *input_b, const float *bias, int8_t *output_c, int batch_count, - int m, int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const float beta, const int act_type, cuinferHandle_t &cuinfer_handle, cudaStream_t &stream); - -void cuinfer_nn_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream); - -void cuinfer_nt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream); - -void cuinfer_tt_i8_gemm(const int8_t *input_a, const int8_t *input_b, int8_t *output_c, int batch_count, int m, int n, - int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - cuinferHandle_t cuinfer_handle, cudaStream_t stream); - -void cuinfer_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k, - int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle, - cudaStream_t stream); - -void cuinfer_nn_gemm(const half *input_a, const half *input_b, half *output_c, int batch_count, int m, int n, int k, - int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, cublasHandle_t cublas_handle, - cudaStream_t stream); - -void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle); -void cuinfer_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, const float beta, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle); -void cuinfer_nn_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle); -void cuinfer_nt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle); -void cuinfer_tt_gemm(const half *input_a, const half *input_b, const half *bias, half *output_c, int batch_count, int m, - int n, int k, int64_t stridea, int64_t strideb, int64_t stridec, const float alpha, - const int act_type, cudaStream_t &stream, cuinferHandle_t &cuinfer_handle); -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git 
a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h deleted file mode 100644 index a75d902fb263c5ed484ec932e4c2c579b7db10c0..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/bertCommon.h +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#pragma once -#include - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "checkMacrosPlugin.h" - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -constexpr uint32_t BDIM = 0; // batch dimension -constexpr uint32_t SDIM = 1; // seq len dimension -constexpr uint32_t HDIM = 2; // hidden dimension - -#define TRT_UNUSED (void) - -template -struct CudaDeleter { - void operator()(T* buf) { IXRT_PLUGIN_CUASSERT(cudaFree(buf)); } -}; - -template -using cuda_unique_ptr = std::unique_ptr>; - -inline uint32_t getElementSize(nvinfer1::DataType t) noexcept { - switch (t) { - case nvinfer1::DataType::kINT32: - return 4; - case nvinfer1::DataType::kFLOAT: - return 4; - case nvinfer1::DataType::kHALF: - return 2; - case nvinfer1::DataType::kBOOL: - // case nvinfer1::DataType::kUINT8: - case nvinfer1::DataType::kINT8: - return 1; - default: - break; - // case DataType::kUNKNOWN: - // case DataType::kINT64: - // case DataType::kFLOAT64: - // break; - } - return 0; -} - -inline int64_t getWeightsSize(nvinfer1::Weights const& w, nvinfer1::DataType type) { - return w.count * getElementSize(type); -} - -template -using cuda_shared_ptr = std::shared_ptr; - -template -void make_cuda_shared(cuda_shared_ptr& ptr, void* cudaMem) { - ptr.reset(static_cast(cudaMem), bert::CudaDeleter()); -} - -struct WeightsWithOwnership : public nvinfer1::Weights { - ILogger* logger_; - WeightsWithOwnership() { - values = nullptr; - count = 0; - } - ~WeightsWithOwnership() { operator delete[](const_cast(values)); } - - WeightsWithOwnership(WeightsWithOwnership const&) = delete; - WeightsWithOwnership operator=(WeightsWithOwnership const&) = delete; - WeightsWithOwnership(WeightsWithOwnership const&&) = delete; - WeightsWithOwnership operator=(WeightsWithOwnership 
const&&) = delete; - - void convertAndCopy(nvinfer1::Weights const& src, nvinfer1::DataType type, float scale = 1) { - this->type = type; - this->count = src.count; - - if (type == nvinfer1::DataType::kFLOAT) { - auto destBuf = new float[src.count]; - this->values = destBuf; - - if (src.type == nvinfer1::DataType::kFLOAT) { - ixrt_plugin::gLogInfo << "Float Weights(Host) => Float Array(Host)" << endl; - std::copy_n(static_cast(src.values), src.count, destBuf); - } else { - IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kHALF); - - ixrt_plugin::gLogInfo << "Half Weights(Host) => Float Array(Host)" << endl; - auto const s = static_cast(src.values); - auto d = static_cast(const_cast(this->values)); - - for (auto it = 0; it < src.count; it++) { - d[it] = __half2float(s[it]); - } - } - } else if (type == nvinfer1::DataType::kHALF) { - auto destBuf = new half[src.count]; - this->values = destBuf; - - if (src.type == nvinfer1::DataType::kHALF) { - ixrt_plugin::gLogInfo << "Half Weights(Host) => Half Array(Host)" << endl; - std::copy_n(static_cast(src.values), src.count, destBuf); - } else { - IXRT_PLUGIN_ASSERT(src.type == nvinfer1::DataType::kFLOAT); - - ixrt_plugin::gLogInfo << "Float Weights(Host) => Half Array(Host)" << endl; - auto const s = static_cast(src.values); - auto d = static_cast(const_cast(this->values)); - - for (auto it = 0; it < src.count; it++) { - d[it] = __float2half(s[it]); - } - } - } else if (type == nvinfer1::DataType::kINT8) { - auto destBuf = new int8_t[src.count]; - this->values = destBuf; - - if (src.type == nvinfer1::DataType::kFLOAT) { - ixrt_plugin::gLogInfo << "Float Weights(Host) => Int8 Array(Host)" << endl; - auto const s = static_cast(src.values); - auto d = static_cast(const_cast(this->values)); - - for (auto it = 0; it < src.count; it++) { - int32_t v = static_cast(std::roundf(s[it] / scale)); - d[it] = v <= -127 ? -127 : (v >= 127 ? 
127 : v); - } - } else if (src.type == nvinfer1::DataType::kINT8) { - ixrt_plugin::gLogInfo << "Int8 Weights(Host) => Int8 Array(Host)" << endl; - std::copy_n(static_cast(src.values), src.count, destBuf); - } else { - throw std::runtime_error("Unsupported DataType specified for plugin."); - } - } else { - throw std::runtime_error("Unsupported DataType specified for plugin."); - } - } - - void convertAndCopy(char const*& srcBuf, size_t count, nvinfer1::DataType type) noexcept { - this->type = type; - this->count = count; - auto const nbBytes = getWeightsSize(*this, type); - auto destBuf = new char[nbBytes]; - this->values = destBuf; - - std::copy_n(srcBuf, nbBytes, destBuf); - srcBuf += nbBytes; - } -}; - -template -inline void copyToDevice(WeightsWithOwnership& hostWeights, size_t nbBytes, cuda_unique_ptr& cudaWeights) { - if (hostWeights.values) { - void* cudaMem{nullptr}; - IXRT_PLUGIN_CUASSERT(cudaMalloc(&cudaMem, nbBytes)); - IXRT_PLUGIN_CUASSERT(cudaMemcpy(cudaMem, hostWeights.values, nbBytes, cudaMemcpyHostToDevice)); - cudaWeights.reset(static_cast(cudaMem)); - } -} - -template -inline void serFromDev(char*& buffer, T const* data, size_t nbElem) { - const size_t len = sizeof(T) * nbElem; - IXRT_PLUGIN_CUASSERT(cudaMemcpy(buffer, static_cast(data), len, cudaMemcpyDeviceToHost)); - buffer += len; -} - -template -inline T* deserToDev(char const*& buffer, size_t nbElem) { - void* dev{nullptr}; - const size_t len = sizeof(T) * nbElem; - IXRT_PLUGIN_CUASSERT(cudaMalloc(&dev, len)); - IXRT_PLUGIN_CUASSERT(cudaMemcpy(dev, buffer, len, cudaMemcpyHostToDevice)); - - buffer += len; - return static_cast(dev); -} - -inline nvinfer1::DataType fieldTypeToDataType(const nvinfer1::PluginFieldType ftype) { - switch (ftype) { - case nvinfer1::PluginFieldType::kFLOAT32: { - gLogInfo << "PluginFieldType is Float32" << endl; - return nvinfer1::DataType::kFLOAT; - } - case nvinfer1::PluginFieldType::kFLOAT16: { - gLogInfo << "PluginFieldType is Float16" << endl; - return nvinfer1::DataType::kHALF; - } - case nvinfer1::PluginFieldType::kINT32: { - gLogInfo << "PluginFieldType is Int32" << endl; - return nvinfer1::DataType::kINT32; - } - case nvinfer1::PluginFieldType::kINT8: { - gLogInfo << "PluginFieldType is Int8" << endl; - return nvinfer1::DataType::kINT8; - } - default: - throw std::invalid_argument("No corresponding datatype for plugin field type"); - } -} - -inline int64_t volume(nvinfer1::Dims const& d) { - return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies{}); -} -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp deleted file mode 100644 index 8e705d6cdb96aef58aa1169cd6d99b5671d0d69e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "checkMacrosPlugin.h" - -#include "NvInferRuntimeCommon.h" - -namespace nvinfer1 { -namespace ixrt_plugin { - -ILogger* gLogger{}; - -template -int32_t LogStream::Buf::sync() { - std::string s = str(); - while (!s.empty() && s.back() == '\n') { - s.pop_back(); - } - if (gLogger != nullptr) { - gLogger->log(kSeverity, s.c_str()); - } - str(""); - return 0; -} - -// These use gLogger, and therefore require initLibNvInferPlugins() to be called with a logger -// (otherwise, it will not log) -LogStream gLogError; -LogStream gLogWarning; -LogStream gLogInfo; -LogStream gLogVerbose; - -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h deleted file mode 100644 index 76d87a927516e4521ebb5233c1e2b729feab9532..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/checkMacrosPlugin.h +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#pragma once -#include - -#include -#include -#include -#include - -#include "NvInfer.h" -#include "NvInferRuntime.h" - -// Logs failed assertion and aborts. -// Aborting is undesirable and will be phased-out from the plugin module, at which point -// PLUGIN_ASSERT will perform the same function as PLUGIN_VALIDATE. 
-using namespace std; - -namespace nvinfer1 { -namespace ixrt_plugin { - -#ifdef _MSC_VER -#define FN_NAME __FUNCTION__ -#else -#define FN_NAME __func__ -#endif - -#define IXRT_PLUGIN_CHECK_VALUE(value, msg) \ - { \ - if (not(value)) { \ - std::cerr << __FILE__ << " (" << __LINE__ << ")" \ - << "-" << __FUNCTION__ << " : " \ - << " Plugin assert error: " << msg << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } - -#define IXRT_PLUGIN_ASSERT(value) \ - { \ - if (not(value)) { \ - std::cerr << __FILE__ << " (" << __LINE__ << ")" \ - << "-" << __FUNCTION__ << " : " \ - << " Plugin assert false" << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } - -#define IXRT_PLUGIN_CHECK_CUDA(call) \ - do { \ - const cudaError_t error_code = call; \ - if (error_code != cudaSuccess) { \ - printf("CUDA Error:\n"); \ - printf(" File: %s\n", __FILE__); \ - printf(" Line: %d\n", __LINE__); \ - printf(" Error code: %d\n", error_code); \ - printf(" Error text: %s\n", cudaGetErrorString(error_code)); \ - exit(1); \ - } \ - } while (0) - -inline void caughtError(const std::exception& e) { std::cerr << e.what() << std::endl; } - -#define IXRT_PLUGIN_FAIL(msg) \ - do { \ - std::ostringstream stream; \ - stream << "Assertion failed: " << msg << "\n" \ - << __FILE__ << ':' << __LINE__ << "\n" \ - << "Aborting..." \ - << "\n"; \ - IXRT_PLUGIN_CHECK_CUDA(cudaDeviceReset()); \ - abort; \ - } while (0) - -inline void throwCudaError(char const* file, char const* function, int32_t line, int32_t status, char const* msg) { - std::cerr << file << " (" << line << ")" - << "-" << function << " : " << msg << std::endl; - std::exit(EXIT_FAILURE); -} - -#define IXRT_PLUGIN_CUASSERT(status_) \ - { \ - auto s_ = status_; \ - if (s_ != cudaSuccess) { \ - const char* msg = cudaGetErrorString(s_); \ - throwCudaError(__FILE__, FN_NAME, __LINE__, s_, msg); \ - } \ - } - -#undef CUINFER_CHECK -#define CUINFER_CHECK(func) \ - do { \ - cuinferStatus_t status = (func); \ - if (status != CUINFER_STATUS_SUCCESS) { \ - std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ - << cuinferGetErrorString(status) << std::endl; \ - std::exit(EXIT_FAILURE); \ - } \ - } while (0) - -static std::string _cudaGetErrorString(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - return "CUBLAS_UNKNOW"; -} - -template -void check_gpu_error(T result, char const* const func, const char* const file, int const line) { - if (result) { - throw std::runtime_error(std::string("[CUDA][ERROR] ") + +file + "(" + std::to_string(line) + - "): " + (_cudaGetErrorString(result)) + "\n"); - } -} - -#define CHECK_GPU_ERROR(val) check_gpu_error((val), #val, __FILE__, __LINE__) - -template -class LogStream : public std::ostream { - class Buf : public std::stringbuf { - public: - int32_t sync() 
override; - }; - - Buf buffer; - std::mutex mLogStreamMutex; - - public: - std::mutex& getMutex() { return mLogStreamMutex; } - LogStream() : std::ostream(&buffer){}; -}; - -// Use mutex to protect multi-stream write to buffer -template -LogStream& operator<<(LogStream& stream, T const& msg) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << msg; - return stream; -} - -// Special handling static numbers -template -inline LogStream& operator<<(LogStream& stream, int32_t num) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << num; - return stream; -} - -// Special handling std::endl -template -inline LogStream& operator<<(LogStream& stream, std::ostream& (*f)(std::ostream&)) { - std::lock_guard guard(stream.getMutex()); - auto& os = static_cast(stream); - os << f; - return stream; -} - -extern LogStream gLogError; -extern LogStream gLogWarning; -extern LogStream gLogInfo; -extern LogStream gLogVerbose; -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh b/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh deleted file mode 100644 index b9b9eb8e4cec752014ccfdab5b259619d9d8d945..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/common_def.cuh +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
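The checkMacrosPlugin.h header above gives the plugins three error-handling layers: IXRT_PLUGIN_CUASSERT for CUDA runtime status codes, CHECK_GPU_ERROR for cuBLAS/cublasLt status codes, and the gLogError/gLogWarning/gLogInfo/gLogVerbose streams, which forward to the TensorRT logger once one has been registered. A hedged usage sketch follows; the function and buffer names are illustrative, not from the deleted sources.

```cpp
#include <cublasLt.h>
#include <cuda_runtime.h>

#include "checkMacrosPlugin.h"  // removed in this diff

using namespace nvinfer1::ixrt_plugin;

void copy_back_and_log(float* h_dst, const float* d_src, size_t count, cudaStream_t stream) {
    // cuBLAS-family status codes go through CHECK_GPU_ERROR, which throws with file/line info.
    cublasLtHandle_t lt_handle{nullptr};
    CHECK_GPU_ERROR(cublasLtCreate(&lt_handle));

    // CUDA runtime status codes go through IXRT_PLUGIN_CUASSERT, which exits on failure.
    IXRT_PLUGIN_CUASSERT(cudaMemcpyAsync(h_dst, d_src, count * sizeof(float),
                                         cudaMemcpyDeviceToHost, stream));
    IXRT_PLUGIN_CUASSERT(cudaStreamSynchronize(stream));

    // Routed to the TensorRT logger when one has been installed; otherwise dropped.
    gLogInfo << "copied " << static_cast<int32_t>(count) << " floats back to host" << endl;

    CHECK_GPU_ERROR(cublasLtDestroy(lt_handle));
}
```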
-*/ - -#pragma once - -#include - -#include -namespace nvinfer1 { -namespace ixrt_plugin { -#ifdef __ILUVATAR__ -static const int kMaxThreadNbPerBlock = 1024; -static const int kMaxBlockNbPerSM = 8; -static const int kWarpSize = 64; -static const dim3 kMaxBlockDimension = {4096, 4096, 64}; -static const dim3 kMaxGridDimension = {4294967295, 65536, 65536}; -static const int kNbThreadsPerBlockGainBestPerformance = 1024; -static const int kMaxSharedMemSizePerBlock = (128 * 1024 * 4); -static const int kNbSmemLane = 64; -static const int kNbBytesPerSmemLane = 4; -#else -static const int kMaxThreadNbPerBlock = 1024; -static const int kMaxBlockNbPerSM = 8; -static const int kWarpSize = 32; -static const dim3 kMaxBlockDimension = {1024, 1024, 64}; -static const dim3 kMaxGridDimension = {2147483647, 65535, 65535}; -static const int kNbThreadsPerBlockGainBestPerformance = 256; -static const int kMaxSharedMemSizePerBlock = 48 * 1024 * 4; -static const int kNbSmemLane = 32; -static const int kNbBytesPerSmemLane = 4; -#endif - -static const int kNbCe = 4; -static const int kNbCuPerCe = 4; -static const int kNbSppPerCu = 4; - -static const float kLog2e = 1.442695040888963387; - -#define DivUp(x, y) (((x) + (y)-1) / (y)) - -__device__ __forceinline__ float floatExp(float x) { return __builtin_exp2f(kLog2e * x); } - -__device__ __forceinline__ float floatLog(float x) { return __logf(x); } - -__forceinline__ int nearest_num(int x, int value) { - if (x % value == 0) { - return x; - } else { - int padding = value - x % value; - return x + padding; - } -} -} // namespace nvinfer1::ixrt_plugin -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp deleted file mode 100644 index 29908ff168e82bc43da09260bd7d5eb4dd52f94b..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
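common_def.cuh above mostly encodes launch-shape constants that differ between the two targets: under __ILUVATAR__ the warp size is 64 and the preferred block size is 1024 threads, while the NVIDIA fallback uses a 32-lane warp and 256 threads. DivUp is the usual ceiling division used to size grids. A small sketch of how a kernel launch would consume these constants; the kernel itself is illustrative only.

```cpp
#include <cuda_runtime.h>

#include "common_def.cuh"  // removed in this diff

using namespace nvinfer1::ixrt_plugin;

__global__ void scale_kernel(float* data, float factor, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        data[idx] *= factor;
    }
}

void scale_inplace(float* d_data, float factor, int n, cudaStream_t stream) {
    int block = kNbThreadsPerBlockGainBestPerformance;  // 1024 on Iluvatar, 256 on NVIDIA
    int grid = DivUp(n, block);                         // round up so every element is covered
    scale_kernel<<<grid, block, 0, stream>>>(d_data, factor, n);
}
```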
-*/ - -#include "plugin.h" -#include "checkMacrosPlugin.h" - -namespace nvinfer1 -{ -namespace ixrt_plugin -{ - -void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc) -{ - for (int32_t i = 0; i < fc->nbFields; i++) - { - requiredFieldNames.erase(fc->fields[i].name); - } - if (!requiredFieldNames.empty()) - { - std::stringstream msg{}; - msg << "PluginFieldCollection missing required fields: {"; - char const* separator = ""; - for (auto const& field : requiredFieldNames) - { - msg << separator << field; - separator = ", "; - } - msg << "}"; - std::string msg_str = msg.str(); - IXRT_PLUGIN_CHECK_VALUE(false, msg_str.c_str()); - } -} - -} // namespace ixrt_plugin -} // namespace nvinfer1 \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h deleted file mode 100644 index b24ef30067eeb17e526f9ee1430031873645dde0..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/plugin.h +++ /dev/null @@ -1,72 +0,0 @@ - -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License.
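plugin.cpp above contributes a single helper: validateRequiredAttributesExist erases every field name present in the incoming PluginFieldCollection from the required set and fails with a readable message if anything is left over. Creators call it before parsing their fields; the FC INT8 creator later in this diff requires "out_dims", "W" and "fc_amax". A minimal sketch of that call; the surrounding function is hypothetical.

```cpp
#include "plugin.h"  // removed in this diff

// Hypothetical pre-parse check, mirroring the FC INT8 creator shown later in this diff.
void check_fc_fields(nvinfer1::PluginFieldCollection const* fc) {
    // Exits with a message listing whichever required attribute the network failed to supply.
    nvinfer1::ixrt_plugin::validateRequiredAttributesExist({"out_dims", "W", "fc_amax"}, fc);
}
```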
-*/ - -#pragma once -#include -#include -#include -#include "NvInferRuntimeCommon.h" - -typedef enum -{ - STATUS_SUCCESS = 0, - STATUS_FAILURE = 1, - STATUS_BAD_PARAM = 2, - STATUS_NOT_SUPPORTED = 3, - STATUS_NOT_INITIALIZED = 4 -} pluginStatus_t; - -namespace nvinfer1 { - -namespace ixrt_plugin { - - -// Write values into buffer -template <typename T> -void write(char*& buffer, const T& val) { - std::memcpy(buffer, &val, sizeof(T)); - buffer += sizeof(T); -} - -// Read values from buffer -template <typename T> -T read(const char*& buffer) { - T val{}; - std::memcpy(&val, buffer, sizeof(T)); - buffer += sizeof(T); - return val; -} - -void validateRequiredAttributesExist(std::set<std::string> requiredFieldNames, PluginFieldCollection const* fc); - -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h b/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h deleted file mode 100644 index 11ef7eca97ce5506712ee7993957cf3ac1eb0086..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/common/serialize.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License.
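plugin.h above supplies the raw write/read helpers the plugins use inside serialize() and their deserialization constructors: fields are packed into the byte buffer in a fixed order and must be unpacked in exactly the same order, with getSerializationSize() reporting the total. A standalone round-trip sketch with illustrative values:

```cpp
#include <cstdint>
#include <vector>

#include "plugin.h"  // removed in this diff

using nvinfer1::ixrt_plugin::read;
using nvinfer1::ixrt_plugin::write;

void round_trip_example() {
    int32_t out_dim = 1024;
    float scale = 0.02f;

    std::vector<char> buf(sizeof(out_dim) + sizeof(scale));
    char* w = buf.data();
    write(w, out_dim);  // advances w by sizeof(int32_t)
    write(w, scale);    // advances w by sizeof(float)

    const char* r = buf.data();
    int32_t restored_dim = read<int32_t>(r);  // must be read back in the same order
    float restored_scale = read<float>(r);
    (void)restored_dim;
    (void)restored_scale;
}
```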
-*/ - -#pragma once - -#include -#include -#include -#include - -#include -using std::cerr; -using std::cout; -using std::endl; - -template -inline void serialize_value(void** buffer, T const& value); - -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value); - -namespace -{ - -template -struct Serializer -{ -}; - -template -struct Serializer::value || std::is_enum::value || std::is_pod::value>::type> -{ - static size_t serialized_size(T const&) - { - return sizeof(T); - } - static void serialize(void** buffer, T const& value) - { - ::memcpy(*buffer, &value, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - } - static void deserialize(void const** buffer, size_t* buffer_size, T* value) - { - assert(*buffer_size >= sizeof(T)); - ::memcpy(value, *buffer, sizeof(T)); - reinterpret_cast(*buffer) += sizeof(T); - *buffer_size -= sizeof(T); - } -}; - -template <> -struct Serializer -{ - static size_t serialized_size(const char* value) - { - return strlen(value) + 1; - } - static void serialize(void** buffer, const char* value) - { - ::strcpy(static_cast(*buffer), value); - reinterpret_cast(*buffer) += strlen(value) + 1; - } - static void deserialize(void const** buffer, size_t* buffer_size, const char** value) - { - *value = static_cast(*buffer); - size_t data_size = strnlen(*value, *buffer_size) + 1; - assert(*buffer_size >= data_size); - reinterpret_cast(*buffer) += data_size; - *buffer_size -= data_size; - } -}; - -template -struct Serializer, - typename std::enable_if::value || std::is_enum::value || std::is_pod::value>::type> -{ - static size_t serialized_size(std::vector const& value) - { - return sizeof(value.size()) + value.size() * sizeof(T); - } - static void serialize(void** buffer, std::vector const& value) - { - serialize_value(buffer, value.size()); - size_t nbyte = value.size() * sizeof(T); - ::memcpy(*buffer, value.data(), nbyte); - reinterpret_cast(*buffer) += nbyte; - } - static void deserialize(void const** buffer, size_t* buffer_size, std::vector* value) - { - size_t size; - deserialize_value(buffer, buffer_size, &size); - value->resize(size); - size_t nbyte = value->size() * sizeof(T); - assert(*buffer_size >= nbyte); - ::memcpy(value->data(), *buffer, nbyte); - reinterpret_cast(*buffer) += nbyte; - *buffer_size -= nbyte; - } -}; - -} // namespace - -template -inline size_t serialized_size(T const& value) -{ - return Serializer::serialized_size(value); -} - -template -inline void serialize_value(void** buffer, T const& value) -{ - return Serializer::serialize(buffer, value); -} - -template -inline void deserialize_value(void const** buffer, size_t* buffer_size, T* value) -{ - return Serializer::deserialize(buffer, buffer_size, value); -} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp deleted file mode 100644 index cf00d620b2c3d47f0bea4bbad3f9fdc003bbe6bd..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cpp +++ /dev/null @@ -1,431 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "NvInferRuntimeCommon.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "cuda_runtime_api.h" -#include "driver_types.h" -#include "fcPlugin.h" -#include "plugin.h" -#include "serialize.h" -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFC_VERSION{"2"}; -char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FCInt8PluginDynamicCreator::mFC{}; -std::vector FCInt8PluginDynamicCreator::mPluginAttributes; - -FCInt8PluginDynamicCreator::FCInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kINT8, 1)); - mPluginAttributes.emplace_back(PluginField("fc_amax", nullptr, PluginFieldType::kFLOAT32, 2)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* FCInt8PluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; } - -char const* FCInt8PluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; } - -PluginFieldCollection const* FCInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* FCInt8PluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FCInt8PluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - Weights W{DataType::kINT8, nullptr, 0LL}; - Weights Bias{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "W", "fc_amax"}, fc); - vector weight_scale; - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("W") == 0) { - gLogInfo << "Building W..." 
<< endl; - W.values = fc->fields[i].data; - W.count = fc->fields[i].length; - W.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W int8: " << (W.type == DataType::kINT8) << endl; - } - - if (fieldName.compare("Bias") == 0) { - gLogInfo << "Building Bias..." << endl; - Bias.values = fc->fields[i].data; - Bias.count = fc->fields[i].length; - Bias.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is Bias float32: " << (Bias.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("fc_amax") == 0) { - gLogInfo << "Building fc_amax..." << endl; - for (auto j = 0; j < fc->fields[i].length; j++) { - auto value = static_cast(fc->fields[i].data)[j]; - weight_scale.emplace_back(value / 127.0); - } - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (W.count == 0 || W.values == nullptr || W.count < outDims) { - gLogInfo << "Invalid weights" << endl; - } - - DataType type = DataType::kINT8; - return new FCInt8PluginDynamic(name, type, outDims, W, Bias, weight_scale); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FCInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FCInt8PluginDynamic::destroy() - try { - return new FCInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FCInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FCInt8PluginDynamicCreator); -//#########################################################################// -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, DataType const type, int32_t const outDim, - Weights const& W, Weights const& Bias, vector const& scale) - : mLayerName(name), - mType(type), - mOutDim(outDim), - mNumParams(W.count), - mNmax(0), - mK(0), - mWdev(nullptr), - mNumBias(Bias.count), - mScale(scale), - mBiasdev(nullptr) { - if (W.type == nvinfer1::DataType::kFLOAT) { - float weight_max = std::numeric_limits::min(); - for (int64_t wb = 0, we = W.count; wb < we; ++wb) { - float val = static_cast(W.values)[wb]; - weight_max = std::max(weight_max, std::abs(val)); - } - // mWeightScale = 127 / weight_max; - } - - mW.convertAndCopy(W, DataType::kINT8, scale[0]); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (Bias.values != nullptr) { - mBias.convertAndCopy(Bias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -FCInt8PluginDynamic::FCInt8PluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev(nullptr), mBiasdev(nullptr) { - gLogInfo << "FCInt8PluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mOutDim); - deserialize_value(&data, &length, &mNumParams); - deserialize_value(&data, &length, &mNmax); - deserialize_value(&data, &length, &mK); - deserialize_value(&data, &length, &mNumBias); - deserialize_value(&data, &length, &mScale); - - char const* d = 
static_cast(data); - - mW.convertAndCopy(d, mNumParams, DataType::kINT8); - copyToDevice(mW, getWeightsSize(mW, DataType::kINT8), mWdev); - if (mNumBias > 0) { - mBias.convertAndCopy(d, mNumBias, DataType::kFLOAT); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kFLOAT), mBiasdev); - } -} - -// IPluginV2 Methods -char const* FCInt8PluginDynamic::getPluginType() const noexcept { return kFC_NAME; } - -char const* FCInt8PluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; } - -int32_t FCInt8PluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FCInt8PluginDynamic::initialize() noexcept { - gLogInfo << "FCInt8PluginDynamic initialize" << endl; - return 0; -} - -void FCInt8PluginDynamic::terminate() noexcept { gLogInfo << "FCInt8PluginDynamic terminate" << endl; } - -size_t FCInt8PluginDynamic::getSerializationSize() const noexcept { - return sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNmax) + sizeof(mK) + sizeof(mNumBias) + - mScale.size() * sizeof(float) + sizeof(mScale.size()) + getElementSize(DataType::kINT8) * mNumParams + - getElementSize(DataType::kFLOAT) * mNumBias; -} - -void FCInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mOutDim); - serialize_value(&buffer, mNumParams); - serialize_value(&buffer, mNmax); - serialize_value(&buffer, mK); - serialize_value(&buffer, mNumBias); - serialize_value(&buffer, mScale); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev.get()), mNumParams * getElementSize(DataType::kINT8)); - - if (mNumBias > 0) { - serFromDev(d, static_cast(mBiasdev.get()), mNumBias * getElementSize(DataType::kFLOAT)); - } -} - -void FCInt8PluginDynamic::destroy() noexcept { - gLogInfo << "FCInt8PluginDynamic destroy" << endl; - mWdev.reset(nullptr); - if (mNumBias > 0) { - mBiasdev.reset(nullptr); - } - delete this; -} - -void FCInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FCInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - // IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kINT8); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FCInt8PluginDynamic::clone() const noexcept { - try { - gLogInfo << "FCInt8PluginDynamic clone" << endl; - - auto* p = new FCInt8PluginDynamic(mLayerName, mType, mOutDim, mW, mBias, mScale); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FCInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutDim); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - 
caughtError(e); - } - return DimsExprs{}; -} - -bool FCInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FCInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - mK = inDims0.d[HDIM]; // hiddensize - // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FCInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - int32_t const B = inputs[0].dims.d[BDIM]; - int32_t const S = inputs[0].dims.d[SDIM]; - int32_t const oE = outputs[0].dims.d[HDIM]; -#ifdef __ILUVATAR__ - return B * S * oE * sizeof(int8_t); -#else - return B * S * oE * sizeof(int32_t); -#endif -} - -int32_t FCInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const E = inputDesc->dims.d[HDIM]; - int32_t const oE = outputDesc->dims.d[HDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - float qkv_in_scale = inputDesc[0].scale; - float qkv_wei_scale = mScale[0]; - float output_scale = outputDesc[0].scale; - float qkv_out_scale; - if (mScale.size() == 2) { - qkv_out_scale = mScale[1]; - } else { - qkv_out_scale = output_scale; - } -#ifdef __ILUVATAR__ - int8_t* buffer = static_cast(workSpace); -#else - int32_t* buffer = static_cast(workSpace); -#endif - if (mType == DataType::kINT8) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight = static_cast(mWdev.get()); - - float dequant_scale = (qkv_in_scale * qkv_wei_scale) / qkv_out_scale; - - if (mBiasdev.get() != nullptr) { -#ifdef __ILUVATAR__ - cuinfer_i8_gemm(weight, input, nullptr, buffer, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0, - cuinfer_handle, stream); - dequantGemmWithBias(buffer, static_cast(mBiasdev.get()), output, B * S, oE, qkv_out_scale, - 1.0 / output_scale, stream); -#else - cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream); - dequantGemmWithBias(buffer, static_cast(mBiasdev.get()), output, B * S, oE, dequant_scale, qkv_out_scale, 
- 1.0 / output_scale, stream); -#endif - - } else { -#ifdef __ILUVATAR__ - cuinfer_i8_gemm(weight, input, nullptr, output, 1, oE, n, E, 0, 0, 0, dequant_scale, 0.0, 0, - cuinfer_handle, stream); -#else - - cublaslt_gemm(weight, input, buffer, 1, oE, n, E, 0, 0, 0, 1, blaslt_handle, stream); - quantGemm(buffer, output, B * S, oE, dequant_scale, stream); -#endif - } - } else { - gLogError << "Unsupported type error, expected [kINT8], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu deleted file mode 100644 index 7e233c878814dc347e7da8e310c96fd24923e8b9..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcInt8Plugin.cu +++ /dev/null @@ -1,485 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "backend/bert/bert_helper.h" -#include "fcPlugin.h" -using namespace nvinfer1::ixrt_plugin::backend; -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -template -__global__ void dequant_gemm_without_bias(const int8_t* input, int8_t* output, int hidden_size, float dequant_scale, - float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - char4* p_input = (char4*)input; - char4* p_output = (char4*)output; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - - val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale; - val[it].y = __int2float_rn(p_input[element_index].y) * dequant_scale; - val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale; - val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale; - - char4 res = float42char4(val[it], quant_scale); - p_output[element_index] = res; - } -} - -template -__global__ void dequant_gemm_with_bias(const int8_t* input, const float* bias, int8_t* output, int hidden_size, - float dequant_scale, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - char4* p_input = (char4*)input; - float4* p_bias = (float4*)bias; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - bias_val.x = p_bias[element_index].x; - bias_val.y = p_bias[element_index].y; - bias_val.z = p_bias[element_index].z; - bias_val.w = p_bias[element_index].w; - - val[it].x = __int2float_rn(p_input[element_index].x) * dequant_scale + bias_val.x; - val[it].y = 
__int2float_rn(p_input[element_index].y) * dequant_scale + bias_val.y; - val[it].z = __int2float_rn(p_input[element_index].z) * dequant_scale + bias_val.z; - val[it].w = __int2float_rn(p_input[element_index].w) * dequant_scale + bias_val.w; - - char4 res = float42char4(val[it], quant_scale); - p_output[element_index] = res; - } -} - -template -__global__ void dequant_gemm_with_bias(const int32_t* input, const float* bias, int8_t* output, int hidden_size, - float quant_scale1, float dequant_scale, float quant_scale2, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - float4* p_bias = (float4*)bias; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - bias_val.x = p_bias[element_index].x; - bias_val.y = p_bias[element_index].y; - bias_val.z = p_bias[element_index].z; - bias_val.w = p_bias[element_index].w; - - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale1); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale1); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale1); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale1); - - val[it].x = __int2float_rn(q_input.x) * dequant_scale + bias_val.x; - val[it].y = __int2float_rn(q_input.y) * dequant_scale + bias_val.y; - val[it].z = __int2float_rn(q_input.z) * dequant_scale + bias_val.z; - val[it].w = __int2float_rn(q_input.w) * dequant_scale + bias_val.w; - - char4 res = float42char4(val[it], quant_scale2); - p_output[element_index] = res; - } -} - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_without_bias<1> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_without_bias<2> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_without_bias<3> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_without_bias<4> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_without_bias<5> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_without_bias<6> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_without_bias<7> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_without_bias<8> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_without_bias<9> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_without_bias<10> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 
11: - dequant_gemm_without_bias<11> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_without_bias<12> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_without_bias<13> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_without_bias<14> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_without_bias<15> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - dequant_gemm_without_bias<16> - <<>>(input, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithoutBias"); - break; - } -} - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, dequant_scale, quant_scale, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int8_t 
input"); - break; - } -} - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float quant_scale1, float dequant_scale, float quant_scale2, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - dequant_gemm_with_bias<1> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 2: - dequant_gemm_with_bias<2> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 3: - dequant_gemm_with_bias<3> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 4: - dequant_gemm_with_bias<4> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 5: - dequant_gemm_with_bias<5> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 6: - dequant_gemm_with_bias<6> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 7: - dequant_gemm_with_bias<7> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 8: - dequant_gemm_with_bias<8> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 9: - dequant_gemm_with_bias<9> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 10: - dequant_gemm_with_bias<10> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 11: - dequant_gemm_with_bias<11> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 12: - dequant_gemm_with_bias<12> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 13: - dequant_gemm_with_bias<13> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 14: - dequant_gemm_with_bias<14> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 15: - dequant_gemm_with_bias<15> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - case 16: - dequant_gemm_with_bias<16> - <<>>(input, bias, output, hidden_size, quant_scale1, dequant_scale, quant_scale2, num_per_tca); - break; - default: - throw std::runtime_error("dequantGemmWithBias with int32_t input"); - break; - } -} - -template -__global__ void quant_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_start = blockIdx.x * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - char4 q_input; - q_input.x = 
float2int8(p_input[element_index].x*1.0, quant_scale); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale); - - p_output[element_index] = q_input; - } -} - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int num_per_tca = 64; - dim3 gridSize(batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - - switch (num_warp) { - case 1: - quant_gemm<1> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 2: - quant_gemm<2> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 3: - quant_gemm<3> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 4: - quant_gemm<4> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 5: - quant_gemm<5> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 6: - quant_gemm<6> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 7: - quant_gemm<7> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 8: - quant_gemm<8> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 9: - quant_gemm<9> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 10: - quant_gemm<10> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 11: - quant_gemm<11> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 12: - quant_gemm<12> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 13: - quant_gemm<13> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 14: - quant_gemm<14> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 15: - quant_gemm<15> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 16: - quant_gemm<16> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - default: - throw std::runtime_error("quantGemm"); - break; - } -} - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp deleted file mode 100644 index 675415356d82188b92994919df2d7f45828ed543..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.cpp +++ /dev/null @@ -1,345 +0,0 @@ -#include "fcPlugin.h" - -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFC_VERSION{"1"}; -char const* const kFC_NAME{"CustomFCPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FCPluginDynamicCreator::mFC{}; -std::vector 
FCPluginDynamicCreator::mPluginAttributes; - -FCPluginDynamicCreator::FCPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("W", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* FCPluginDynamicCreator::getPluginName() const noexcept { return kFC_NAME; } - -char const* FCPluginDynamicCreator::getPluginVersion() const noexcept { return kFC_VERSION; } - -PluginFieldCollection const* FCPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* FCPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FCPluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - int32_t typeId = -1; - Weights W{DataType::kFLOAT, nullptr, 0LL}; - Weights B{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("type_id") == 0) { - typeId = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (fieldName.compare("W") == 0) { - gLogInfo << "Building W..." << endl; - W.values = fc->fields[i].data; - W.count = fc->fields[i].length; - W.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W float32: " << (W.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("B") == 0) { - gLogInfo << "Building B..." << endl; - B.values = fc->fields[i].data; - B.count = fc->fields[i].length; - B.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is B float32: " << (B.type == DataType::kFLOAT) << endl; - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (typeId < 0 || typeId > 1) { - gLogInfo << "Invalid type id" << typeId << endl; - } - if (W.count == 0 || W.values == nullptr || W.count < outDims) { - gLogInfo << "Invalid weights" << endl; - } - - DataType type = typeId == 0 ? 
DataType::kFLOAT : DataType::kHALF; - return new FCPluginDynamic(name, type, outDims, W, B); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FCPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FCPluginDynamic::destroy() - try { - return new FCPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FCPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FCPluginDynamicCreator); -//#########################################################################// -FCPluginDynamic::FCPluginDynamic(std::string const name, DataType const type, int32_t const outDim, Weights const& W, - Weights const& B) - : mLayerName(name), - mType(type), - mOutDim(outDim), - mNumParams(W.count), - mNumBias(B.count), - mWdev(nullptr), - mBdev(nullptr) { - mW.convertAndCopy(W, mType); - copyToDevice(mW, getWeightsSize(mW, mType), mWdev); - if (mNumBias) { - mB.convertAndCopy(B, mType); - copyToDevice(mB, getWeightsSize(mB, mType), mBdev); - } -} - -FCPluginDynamic::FCPluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev(nullptr) { - gLogInfo << "FCPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mOutDim); - deserialize_value(&data, &length, &mNumParams); - deserialize_value(&data, &length, &mNumBias); - - char const* d = static_cast(data); - - mW.convertAndCopy(d, mNumParams, mType); - copyToDevice(mW, getWeightsSize(mW, mType), mWdev); - if (mNumBias) { - mB.convertAndCopy(d, mNumBias, mType); - copyToDevice(mB, getWeightsSize(mB, mType), mBdev); - } -} - -// IPluginV2 Methods -char const* FCPluginDynamic::getPluginType() const noexcept { return kFC_NAME; } - -char const* FCPluginDynamic::getPluginVersion() const noexcept { return kFC_VERSION; } - -int32_t FCPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FCPluginDynamic::initialize() noexcept { - gLogInfo << "FCPluginDynamic initialize" << endl; - return 0; -} - -void FCPluginDynamic::terminate() noexcept { gLogInfo << "FCPluginDynamic terminate" << endl; } - -size_t FCPluginDynamic::getSerializationSize() const noexcept { - size_t wordSize = getElementSize(mType); - return wordSize * (mNumParams + mNumBias) + sizeof(mType) + sizeof(mOutDim) + sizeof(mNumParams) + sizeof(mNumBias); -} - -void FCPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mOutDim); - serialize_value(&buffer, mNumParams); - serialize_value(&buffer, mNumBias); - - size_t wordSize = getElementSize(mType); - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev.get()), mNumParams * wordSize); - if (mNumBias) { - serFromDev(d, static_cast(mBdev.get()), mNumBias * wordSize); - } -} - -void FCPluginDynamic::destroy() noexcept { - gLogInfo << "FCPluginDynamic destroy" << endl; - mWdev.reset(nullptr); - if (mNumBias) { - mBdev.reset(nullptr); - } - delete this; -} - 
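
[Editorial note, not part of the deleted sources.] The serialize()/getSerializationSize() methods above and the deserializing constructor rely on a fixed-order layout: plain-old-data fields are written first with serialize_value, then the raw weight bytes are appended from device memory with serFromDev, and deserialization must read everything back in exactly the same order. The following is a minimal host-side sketch of that layout idea only; writePod/readPod and the memcpy-based buffer are illustrative stand-ins, not the repository's serialize_value/serFromDev helpers.

```cpp
// Illustrative sketch of the fixed-order "POD fields first, raw weight blob last"
// serialization layout used by the plugin; helper names here are hypothetical.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void writePod(char*& dst, const T& v) {          // append a trivially copyable field
    std::memcpy(dst, &v, sizeof(T));
    dst += sizeof(T);
}

template <typename T>
void readPod(const char*& src, T& v) {           // read it back in the same order
    std::memcpy(&v, src, sizeof(T));
    src += sizeof(T);
}

int main() {
    int32_t outDim = 1024;
    int64_t numParams = 4;
    std::vector<int8_t> weights{1, -2, 3, -4};   // stands in for the device weight blob

    // serialize: fixed fields first, raw weight bytes last
    std::vector<char> blob(sizeof(outDim) + sizeof(numParams) + weights.size());
    char* w = blob.data();
    writePod(w, outDim);
    writePod(w, numParams);
    std::memcpy(w, weights.data(), weights.size());

    // deserialize: must consume fields in exactly the same order
    const char* r = blob.data();
    int32_t outDim2 = 0;
    int64_t numParams2 = 0;
    readPod(r, outDim2);
    readPod(r, numParams2);
    std::vector<int8_t> weights2(r, r + numParams2);

    assert(outDim2 == outDim && weights2 == weights);
    return 0;
}
```

This is why getSerializationSize() sums the sizes of the scalar members plus the element sizes of the weight and bias blobs: any mismatch between the write order and the read order would silently corrupt the restored plugin state.
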
-void FCPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FCPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FCPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FCPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FCPluginDynamic clone" << endl; - - auto* p = new FCPluginDynamic(mLayerName, mType, mOutDim, mW, mB); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FCPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mOutDim); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FCPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FCPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - // IXRT_PLUGIN_ASSERT(hiddenSize * mOutDim == mNumParams); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FCPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t FCPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FCPluginDynamic.." 
<< endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const E = inputDesc->dims.d[HDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight = static_cast(mWdev.get()); - half* bias = nullptr; - if (mNumBias) { - bias = static_cast(mBdev.get()); - } - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight, input, bias, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, -1, stream, cuinfer_handle); -#else - cublaslt_gemm(weight, input, output, 1, mOutDim, n, E, 0, 0, 0, 1.0f, blaslt_handle, stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h deleted file mode 100644 index 2f9115dc166a087a9bfd604a9697b8db28a2c8ca..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/custom_fc/fcPlugin.h +++ /dev/null @@ -1,246 +0,0 @@ - -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#pragma once -#include -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "bertCommon.h" -#include "driver_types.h" - -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#else -#include "backend/cublas/cublas_helper.h" -#endif - -namespace nvinfer1 { -namespace ixrt_plugin { -namespace bert { - -void quantGemm(int32_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - cudaStream_t stream); - -void dequantGemmWithBias(int32_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale1, float dequant_scale2, float quant_scale, cudaStream_t stream); - -void dequantGemmWithBias(int8_t* input, float* bias, int8_t* output, int batch_seq_len, int hidden_size, - float dequant_scale, float quant_scale, cudaStream_t stream); - -void dequantGemmWithoutBias(int8_t* input, int8_t* output, int batch_seq_len, int hidden_size, float dequant_scale, - float quant_scale, cudaStream_t stream); - -class FCPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FCPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& B); - - FCPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FCPluginDynamic without arguments, so we - // delete default constructor. - FCPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - size_t mNumBias; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - bert::WeightsWithOwnership mB; - bert::cuda_unique_ptr mBdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t 
blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FCPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FCPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class FCInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FCInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector const& scale); - - FCInt8PluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FCInt8PluginDynamic without arguments, so we - // delete default constructor. - FCInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - int32_t mNmax; - int32_t mK; - int32_t mNumBias; - - vector mScale; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; 
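
[Editorial note, not part of the deleted sources.] The INT8 FC path declared above computes its epilogue with the scale plumbing visible in FCInt8PluginDynamic::enqueue(): dequant_scale = (input_scale * weight_scale) / output_scale, followed by a bias add and requantization in dequantGemmWithBias(). The scalar reference below shows that arithmetic only, under the usual convention real_value ≈ int8_value * scale; the scale values, the quantize() helper, and the symmetric clamp to [-127, 127] are illustrative assumptions, and the deleted kernels split this work between the int8 GEMM and a separate dequant/requant pass whose exact scaling differs between the __ILUVATAR__ and cuBLASLt branches.

```cpp
// Scalar reference (hypothetical) for the int8 FC epilogue:
// int32 accumulate -> dequantize -> add bias -> requantize to int8.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int8_t quantize(float x, float scale) {
    // round to nearest and clamp to the symmetric int8 range
    int v = static_cast<int>(std::lrintf(x / scale));
    return static_cast<int8_t>(std::max(-127, std::min(127, v)));
}

int main() {
    const float in_scale = 0.05f, wei_scale = 0.02f, out_scale = 0.1f;
    std::vector<int8_t> x{10, -20, 30};   // quantized activations
    std::vector<int8_t> w{5, 4, -3};      // quantized weights (one output row)
    const float bias = 0.25f;

    // int32 accumulation, as an int8 GEMM would produce
    int32_t acc = 0;
    for (size_t i = 0; i < x.size(); ++i) acc += int32_t(x[i]) * int32_t(w[i]);

    // dequantize, add bias in float, requantize at the output scale
    float real = float(acc) * in_scale * wei_scale + bias;
    int8_t y = quantize(real, out_scale);

    std::printf("acc=%d real=%.4f y=%d\n", acc, real, int(y));
    return 0;
}
```

The sketch covers only the arithmetic. Separately, the host launchers in the deleted fcInt8Plugin.cu map the runtime-derived num_warp value (hidden_size / 64 / 4) onto a compile-time THREAD_DATA_LEN through a switch over template instantiations <1>..<16>, so the per-thread vectorized loops can be fully unrolled; that dispatch pattern is orthogonal to the scaling math shown here.
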
- -class FCInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FCInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace ixrt_plugin -} // namespace nvinfer1 diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp deleted file mode 100644 index 292e8a631f945c33c2ec7771e9f1136b5ee6828c..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cpp +++ /dev/null @@ -1,503 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#include "NvInferImpl.h" -#include "NvInferRuntimeCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" -#include "embLayerNormInt8Plugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_INT8_VERSION{"2"}; -char const* EMB_LAYER_NORM_INT8_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormInt8PluginDynamicCreator::mFC{}; -std::vector EmbLayerNormInt8PluginDynamicCreator::mPluginAttributes; - -EmbLayerNormInt8PluginDynamicCreator::EmbLayerNormInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return EMB_LAYER_NORM_INT8_VERSION; -} - -PluginFieldCollection const* EmbLayerNormInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormInt8PluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl; - word_emb.values = fc->fields[i].data; - word_emb.count = fc->fields[i].length; - word_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl; - tok_emb.values = fc->fields[i].data; - tok_emb.count = fc->fields[i].length; - tok_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_position_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_position_embeddings..." << endl; - pos_emb.values = fc->fields[i].data; - pos_emb.count = fc->fields[i].length; - pos_emb.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp16") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp16 = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building output_fp16: " << output_fp16 << endl; - } - - if (field_name.compare("full_mask") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - useFullMask = static_cast(fc->fields[i].data)[0] != 0; - gLogInfo << "Building full_mask: " << useFullMask << endl; - } - - if (field_name.compare("mha_type_id") == 0) { - mhaTypeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3); - gLogInfo << "Building mha typeId: " << mhaTypeId << endl; - } - - if (field_name.compare("pad_id") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32) - pad_id = *static_cast(fc->fields[i].data); - } - } - gLogInfo << "Building EmbLayerNormInt8PluginDynamic Plugin..." << endl; - DataType mhaType = static_cast(mhaTypeId); - EmbLayerNormInt8PluginDynamic* p = - new EmbLayerNormInt8PluginDynamic(name, output_fp16 ? 
DataType::kHALF : DataType::kFLOAT, mhaType, beta, - gamma, word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormInt8PluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormInt8PluginDynamicCreator); - -//#########################################################################// -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, - Weights const& gamma, Weights const& wordEmb, - Weights const& posEmb, Weights const& tokEmb, - bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormInt8PluginDynamic::EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormInt8PluginDynamic deserialize." 
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kFLOAT); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(float) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(float) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormInt8PluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_INT8_NAME; } - -char const* EmbLayerNormInt8PluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_INT8_VERSION; } - -int32_t EmbLayerNormInt8PluginDynamic::getNbOutputs() const noexcept { return 3; } - -int32_t EmbLayerNormInt8PluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormInt8PluginDynamic::terminate() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic terminate." << endl; -} - -size_t EmbLayerNormInt8PluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(float) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormInt8PluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormInt8PluginDynamic destroy." 
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormInt8PluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index >= 0 && index <= 2); - if (index == 0) { - return mMhaType; - } - if (index == 1) { - return DataType::kINT8; - } - return DataType::kFLOAT; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormInt8PluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormInt8PluginDynamic clone." << endl; - - auto p = new EmbLayerNormInt8PluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex >= 0 || outputIndex <= 2); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - if (outputIndex == 1) { - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } - - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 3); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // emb_out - if (pos == 3 || pos == 4) { - return desc.type == DataType::kINT8 && desc.format == prev.format; - } - // residual - return desc.type == DataType::kFLOAT; -} - -void EmbLayerNormInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t 
nbInputs,
-                                                    DynamicPluginTensorDesc const* outputs,
-                                                    int32_t nbOutputs) noexcept {
-    gLogInfo << "EmbLayerNormInt8PluginDynamic configurePlugin." << endl;
-
-    // Validate input arguments
-    IXRT_PLUGIN_ASSERT(nbOutputs == 3);
-    IXRT_PLUGIN_ASSERT(nbInputs == 3);
-
-    IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2);
-    int32_t const S = inputs[0].desc.dims.d[SDIM];
-    mSeqLen = S;
-    int32_t const B = inputs[0].desc.dims.d[BDIM];
-    TRT_UNUSED B;
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[1].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(mSeqLen == static_cast<size_t>(inputs[2].desc.dims.d[SDIM]));
-    IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]);
-
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[SDIM]) == mSeqLen);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B);
-    IXRT_PLUGIN_ASSERT(static_cast<size_t>(outputs[0].desc.dims.d[2]) == mHiddenSize);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1);
-
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B);
-    IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == S);
-
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.nbDims == 5);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[SDIM] == outputs[0].desc.dims.d[SDIM]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[BDIM] == outputs[0].desc.dims.d[BDIM]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[2] == outputs[0].desc.dims.d[2]);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[3] == 1);
-    IXRT_PLUGIN_ASSERT(outputs[2].desc.dims.d[4] == 1);
-}
-
-size_t EmbLayerNormInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs,
-                                                       PluginTensorDesc const* outputs,
-                                                       int32_t nbOutputs) const noexcept {
-    int32_t const B = inputs[0].dims.d[BDIM];
-    int32_t const S = inputs[0].dims.d[SDIM];
-    return B * S * sizeof(int32_t);
-}
-
-int32_t EmbLayerNormInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc,
-                                               void const* const* inputs, void* const* outputs, void* workspace,
-                                               cudaStream_t stream) noexcept {
-    try {
-        int32_t const B = inputDesc->dims.d[BDIM];
-        int32_t const S = inputDesc->dims.d[SDIM];
-        int32_t status = STATUS_SUCCESS;
-        int32_t fmha_S = S;
-        int32_t batch_tokens = B * fmha_S;
-
-        // Our plugin outputs only one tensor
-        auto const inputIds = static_cast<int32_t const*>(inputs[0]);
-        auto const segmentIds = static_cast<int32_t const*>(inputs[1]);
-
-        float const* beta = mBetaDev.get();
-        float const* gamma = mGammaDev.get();
-        auto output = static_cast<int8_t*>(outputs[0]);
-        auto mNewMask = static_cast<int8_t*>(outputs[1]);
-        auto residual = static_cast<float*>(outputs[2]);
-        auto const wordEmb = static_cast<float const*>(mWordEmbDev.get());
-        auto const tokEmb = static_cast<float const*>(mTokEmbDev.get());
-        auto const posEmb = static_cast<float const*>(mPosEmbDev.get());
-
-        float l0_qkv_in_amax = outputDesc[0].scale * 127;
-
-        auto mask_idx = static_cast<int32_t*>(workspace);
-        status = embLayerNorm(stream, static_cast<int32_t>(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma,
-                              wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, residual, output, mask_idx,
-                              mPadId, l0_qkv_in_amax);
-
-        IxinferMaskPad(mask_idx, mNewMask, B, S, mHiddenSize, fmha_S, batch_tokens, stream);
-
-        if (status != cudaSuccess) {
-            return STATUS_FAILURE;
-        }
-
-        return STATUS_SUCCESS;
-    } catch (std::exception const& e) {
-        caughtError(e);
-    }
-    return STATUS_FAILURE;
-}
diff --git
a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu deleted file mode 100644 index 3aa0cd8668cf3c35ae8befde4ec3afd7f3e73e22..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.cu +++ /dev/null @@ -1,342 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormInt8Plugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template -__global__ void IxinferResidualI8O(const float *input, int8_t *output, int hidden_size, float quant_scale) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size; - - input += block_start; - output += block_start; - - float4 *p_input = (float4 *)input; - char4 *p_output = (char4 *)output; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - vals[it].x = p_input[element_index].x; - vals[it].y = p_input[element_index].y; - vals[it].z = p_input[element_index].z; - vals[it].w = p_input[element_index].w; - - char4 res = float42char4(vals[it], quant_scale); - p_output[element_index] = res; - } -} - -template -void IxinferResidualI8OLauncher(const T *input, int8_t *output, int batch_tokens, int hidden_size, float quant_scale, - cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size / 4 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferResidualI8O<1><<>>(input, output, hidden_size, quant_scale); - break; - case 2: - IxinferResidualI8O<2><<>>(input, output, hidden_size, quant_scale); - break; - case 3: - IxinferResidualI8O<3><<>>(input, output, hidden_size, quant_scale); - break; - case 4: - IxinferResidualI8O<4><<>>(input, output, hidden_size, quant_scale); - break; - case 5: - IxinferResidualI8O<5><<>>(input, output, hidden_size, quant_scale); - break; - case 6: - IxinferResidualI8O<6><<>>(input, output, hidden_size, quant_scale); - break; - case 7: - IxinferResidualI8O<7><<>>(input, output, hidden_size, quant_scale); - break; - case 8: - IxinferResidualI8O<8><<>>(input, output, hidden_size, quant_scale); - break; - case 9: - IxinferResidualI8O<9><<>>(input, output, hidden_size, quant_scale); - break; - case 10: - IxinferResidualI8O<10><<>>(input, output, hidden_size, quant_scale); - break; - case 11: - IxinferResidualI8O<11><<>>(input, output, hidden_size, quant_scale); - break; - case 12: - IxinferResidualI8O<12><<>>(input, output, hidden_size, quant_scale); - break; - case 13: - IxinferResidualI8O<13><<>>(input, output, 
hidden_size, quant_scale); - break; - case 14: - IxinferResidualI8O<14><<>>(input, output, hidden_size, quant_scale); - break; - case 15: - IxinferResidualI8O<15><<>>(input, output, hidden_size, quant_scale); - break; - case 16: - IxinferResidualI8O<16><<>>(input, output, hidden_size, quant_scale); - break; - default: - throw std::runtime_error("IxinferResidualI8OLauncher"); - break; - } -} - -template -__global__ void IxinferBertEmbedLnKernel(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, - float *output, int *pad_mask, int *type_ids, int pad_id, int batch_size, - int seq_len, int hidden_dim, const float *scale, const float *bias) { - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - int batch_idx, seq_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - output += block_start; - - float4 *p_output = (float4 *)output; - - float4 *p_scale = (float4 *)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_value = (float4 *)(token_emb + token * hidden_dim); - float4 *p_pemb = (float4 *)(pos_emb + seq_idx * hidden_dim); - float4 *p_temb = (float4 *)(type_emb + token_type * hidden_dim); - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - if (token == pad_id) { - if (element_index == 0) { - pad_mask[tokens_idx] = 1; - } - vals[it] = make_float4(0.f, 0.f, 0.f, 0.f); - - } else { - if (element_index == 0) { - pad_mask[tokens_idx] = 0; - } - - vals[it].x = p_value[element_index].x + p_pemb[element_index].x + p_temb[element_index].x; - vals[it].y = p_value[element_index].y + p_pemb[element_index].y + p_temb[element_index].y; - vals[it].z = p_value[element_index].z + p_pemb[element_index].z + p_temb[element_index].z; - vals[it].w = p_value[element_index].w + p_pemb[element_index].w + p_temb[element_index].w; - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - } - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 scale_value = p_scale[element_index]; - float4 bias_value = p_bias[element_index]; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_dim, epsilon, - scale_value, bias_value); - int tokens_idx = blockIdx.x; - - int token = tokens[tokens_idx]; - if (token == pad_id) { - p_output[element_index] = make_float4(0.f, 0.f, 0.f, 0.f); - } else { - p_output[element_index] = norm_value; - } - } -} - - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream) { - if (hidden_size > 4096) { - throw 
std::runtime_error("hidden_size should <= 4096"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t 
const wordSize, - int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float l0_qkv_in_amax) -{ - IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, buffer, maskIdx, (int*)segmentIds, - padId, B, S, E, gamma, beta, stream); - - IxinferResidualI8OLauncher(buffer, output, B*S, E, 127.0 / l0_qkv_in_amax, stream); - return cudaSuccess; -} - -void __global__ IxinferMaskPadKernel(const int32_t* mask, int8_t* new_mask, int bsz, - int ori_seq_len, int hsz, int fmha_seq_len) { - int batch_idx = blockIdx.x; - int seq_idx = blockIdx.y; - - if (seq_idx < ori_seq_len) { - if (threadIdx.x == 0) { - new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx]; - } - } else { - new_mask[batch_idx * fmha_seq_len + seq_idx] = 1; - } -} - -void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream) { - if (hsz / 2 > 4096) { - throw std::runtime_error("hsz/2>4096"); - } - if (hsz % 2 != 0) { - throw std::runtime_error("hsz % 2 !=0"); - } - if (ori_seq_len > fmha_seq_len) { - throw std::runtime_error("ori_seq_len > fmha_seq_len"); - } - if (bsz * ori_seq_len > batch_tokens) { - throw std::runtime_error("bsz*ori_seq_len > batch_tokens"); - } - dim3 blockSize(bsz, fmha_seq_len); - IxinferMaskPadKernel<<>>(mask, new_mask, bsz, ori_seq_len, hsz, - fmha_seq_len); -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h deleted file mode 100644 index 5fee7a4326b6ce9dbd45a1f868507956db8e450f..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormInt8Plugin.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#include -#include -#include -#include -#include - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -void IxinferBertEmbedLn(const float *token_emb, const float *pos_emb, const float *type_emb, const int *tokens, float *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const float *scale, const float *bias, cudaStream_t stream); - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - float const* beta, float const* gamma, float const* wordEmb, float const* posEmb, float const* tokEmb, int32_t const wordSize, - int32_t const tokSize, float* buffer, int8_t* output, int32_t* maskIdx, int32_t padId, float token_embed_amax_); - -void IxinferMaskPad(int32_t* mask, int8_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -class EmbLayerNormInt8PluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormInt8PluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormInt8PluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormInt8PluginDynamic() noexcept = delete; - ~EmbLayerNormInt8PluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - // cuda_unique_ptr mNewMask; 
- WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; - WeightsWithOwnership mTokEmb; - WeightsWithOwnership mPosEmb; -}; - -class EmbLayerNormInt8PluginDynamicCreator : public IPluginCreator { - public: - EmbLayerNormInt8PluginDynamicCreator(); - - ~EmbLayerNormInt8PluginDynamicCreator() override = default; - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - PluginFieldCollection const* getFieldNames() noexcept override; - - IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; - - IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; - -}; - - -} // namespace bert -} //namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp deleted file mode 100644 index 499b2eefc7c691caf0234bde372412d7e69d1aef..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cpp +++ /dev/null @@ -1,495 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "embLayerNormPlugin.h" - -#include "NvInferImpl.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "driver_types.h" - -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* EMB_LAYER_NORM_VERSION{"1"}; -char const* EMB_LAYER_NORM_NAME{"CustomEmbLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection EmbLayerNormPluginDynamicCreator::mFC{}; -std::vector EmbLayerNormPluginDynamicCreator::mPluginAttributes; - -EmbLayerNormPluginDynamicCreator::EmbLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_beta")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_layernorm_gamma")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_word_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_token_type_embeddings")); - mPluginAttributes.emplace_back(PluginField("bert_embeddings_position_embeddings")); - mPluginAttributes.emplace_back(PluginField("output_fp16")); - mPluginAttributes.emplace_back(PluginField("full_mask")); - mPluginAttributes.emplace_back(PluginField("mha_type_id")); - mPluginAttributes.emplace_back(PluginField("pad_id")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginName() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -PluginFieldCollection const* EmbLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - IXRT_PLUGIN_ASSERT(fc != nullptr); - gLogInfo << "EmbLayerNormPluginDynamic createPlugin." << endl; - std::set const requiredAttributes{ - "bert_embeddings_layernorm_beta", "bert_embeddings_layernorm_gamma", - "bert_embeddings_word_embeddings", "bert_embeddings_token_type_embeddings", - "bert_embeddings_position_embeddings", - }; - - bool output_fp16 = false; - bool useFullMask = false; - Weights beta{}; - Weights gamma{}; - Weights word_emb{}; - Weights pos_emb{}; - Weights tok_emb{}; - int32_t mhaTypeId = 0; - int32_t pad_id = 0; - - for (auto i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { - gLogInfo << "Building bert_embeddings_layernorm_gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bert_embeddings_word_embeddings") == 0) { - gLogInfo << "Building bert_embeddings_word_embeddings..." 
<< endl;
-                word_emb.values = fc->fields[i].data;
-                word_emb.count = fc->fields[i].length;
-                word_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_token_type_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_token_type_embeddings..." << endl;
-                tok_emb.values = fc->fields[i].data;
-                tok_emb.count = fc->fields[i].length;
-                tok_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("bert_embeddings_position_embeddings") == 0) {
-                gLogInfo << "Building bert_embeddings_position_embeddings..." << endl;
-                pos_emb.values = fc->fields[i].data;
-                pos_emb.count = fc->fields[i].length;
-                pos_emb.type = fieldTypeToDataType(fc->fields[i].type);
-            }
-
-            if (field_name.compare("output_fp16") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                output_fp16 = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building output_fp16: " << output_fp16 << endl;
-            }
-
-            if (field_name.compare("full_mask") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32);
-                useFullMask = static_cast<int32_t const*>(fc->fields[i].data)[0] != 0;
-                gLogInfo << "Building full_mask: " << useFullMask << endl;
-            }
-
-            if (field_name.compare("mha_type_id") == 0) {
-                mhaTypeId = *static_cast<int32_t const*>(fc->fields[i].data);
-                IXRT_PLUGIN_ASSERT(mhaTypeId >= 0 && mhaTypeId < 3);
-                gLogInfo << "Building mha typeId: " << mhaTypeId << endl;
-            }
-
-            if (field_name.compare("pad_id") == 0) {
-                IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32)
-                pad_id = *static_cast<int32_t const*>(fc->fields[i].data);
-            }
-        }
-        gLogInfo << "Building EmbLayerNormPluginDynamic Plugin..." << endl;
-        DataType mhaType = static_cast<DataType>(mhaTypeId);
-        EmbLayerNormPluginDynamic* p =
-            new EmbLayerNormPluginDynamic(name, output_fp16 ?
DataType::kHALF : DataType::kFLOAT, mhaType, beta, gamma, - word_emb, pos_emb, tok_emb, useFullMask, pad_id); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2DynamicExt* EmbLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - IXRT_PLUGIN_ASSERT(serialData != nullptr); - return new EmbLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void EmbLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(pluginNamespace != nullptr); - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(EmbLayerNormPluginDynamicCreator); - -//#########################################################################// -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, DataType const type, - DataType const mhaType, Weights const& beta, Weights const& gamma, - Weights const& wordEmb, Weights const& posEmb, - Weights const& tokEmb, bool const useFullMask, int32_t padId) - : mLayerName(name), - mHiddenSize(beta.count), - mEmbType(type), - mUseFullMask(useFullMask), - mMhaType(mhaType), - mPadId(padId) { - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - IXRT_PLUGIN_ASSERT(mHiddenSize > 0U); - IXRT_PLUGIN_ASSERT(wordEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(posEmb.count % mHiddenSize == 0); - IXRT_PLUGIN_ASSERT(tokEmb.count % mHiddenSize == 0); - mWordVocabSize = wordEmb.count / mHiddenSize; - mPosVocabSize = posEmb.count / mHiddenSize; - mTokVocabSize = tokEmb.count / mHiddenSize; - - mBeta.convertAndCopy(beta, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(gamma, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(wordEmb, mEmbType); - mTokEmb.convertAndCopy(tokEmb, mEmbType); - mPosEmb.convertAndCopy(posEmb, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -EmbLayerNormPluginDynamic::EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mWordEmbDev(nullptr), - mTokEmbDev(nullptr), - mPosEmbDev(nullptr) { - gLogInfo << "EmbLayerNormPluginDynamic deserialize." 
<< endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mEmbType); - deserialize_value(&data, &length, &mMhaType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mSeqLen); - deserialize_value(&data, &length, &mPadId); - deserialize_value(&data, &length, &mWordVocabSize); - deserialize_value(&data, &length, &mPosVocabSize); - deserialize_value(&data, &length, &mTokVocabSize); - deserialize_value(&data, &length, &mUseFullMask); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mGamma.convertAndCopy(d, mHiddenSize, nvinfer1::DataType::kHALF); - mWordEmb.convertAndCopy(d, mHiddenSize * mWordVocabSize, mEmbType); - mPosEmb.convertAndCopy(d, mHiddenSize * mPosVocabSize, mEmbType); - mTokEmb.convertAndCopy(d, mHiddenSize * mTokVocabSize, mEmbType); - - copyToDevice(mGamma, sizeof(half) * mGamma.count, mGammaDev); - copyToDevice(mBeta, sizeof(half) * mBeta.count, mBetaDev); - copyToDevice(mWordEmb, getWeightsSize(mWordEmb, mEmbType), mWordEmbDev); - copyToDevice(mPosEmb, getWeightsSize(mPosEmb, mEmbType), mPosEmbDev); - copyToDevice(mTokEmb, getWeightsSize(mTokEmb, mEmbType), mTokEmbDev); -} - -// IPluginV2 Methods -char const* EmbLayerNormPluginDynamic::getPluginType() const noexcept { return EMB_LAYER_NORM_NAME; } - -char const* EmbLayerNormPluginDynamic::getPluginVersion() const noexcept { return EMB_LAYER_NORM_VERSION; } - -int32_t EmbLayerNormPluginDynamic::getNbOutputs() const noexcept { return 2; } - -int32_t EmbLayerNormPluginDynamic::initialize() noexcept { return 0; } - -void EmbLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "EmbLayerNormPluginDynamic terminate." << endl; } - -size_t EmbLayerNormPluginDynamic::getSerializationSize() const noexcept { - size_t const wordSize = getElementSize(mEmbType); - return sizeof(mEmbType) * 2 // mEmbType, mMhaType - + sizeof(mHiddenSize) * 6 // mHiddenSize, mSeqLen, 3*VocabSize, mPadId - + sizeof(mUseFullMask) // mask type - + 2 * sizeof(half) * mHiddenSize // beta + gamma - + wordSize * mHiddenSize * mWordVocabSize // word emb - + wordSize * mHiddenSize * mPosVocabSize // pos emb - + wordSize * mHiddenSize * mTokVocabSize // tok emb - ; -} - -void EmbLayerNormPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mEmbType); - serialize_value(&buffer, mMhaType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mSeqLen); - serialize_value(&buffer, mPadId); - serialize_value(&buffer, mWordVocabSize); - serialize_value(&buffer, mPosVocabSize); - serialize_value(&buffer, mTokVocabSize); - serialize_value(&buffer, mUseFullMask); - - char* d = static_cast(buffer); - serFromDev(d, mBetaDev.get(), mHiddenSize); - serFromDev(d, mGammaDev.get(), mHiddenSize); - size_t const wordSize = getElementSize(mEmbType); - serFromDev(d, static_cast(mWordEmbDev.get()), mHiddenSize * mWordVocabSize * wordSize); - serFromDev(d, static_cast(mPosEmbDev.get()), mHiddenSize * mPosVocabSize * wordSize); - serFromDev(d, static_cast(mTokEmbDev.get()), mHiddenSize * mTokVocabSize * wordSize); -} - -void EmbLayerNormPluginDynamic::destroy() noexcept { - gLogInfo << "EmbLayerNormPluginDynamic destroy." 
<< endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - mWordEmbDev.reset(nullptr); - mPosEmbDev.reset(nullptr); - mTokEmbDev.reset(nullptr); - delete this; -} - -void EmbLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* EmbLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType EmbLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0 || index == 1); - if (index == 0) { - IXRT_PLUGIN_ASSERT(mMhaType == DataType::kHALF || mMhaType == DataType::kFLOAT); - return mMhaType; - } - return DataType::kINT32; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* EmbLayerNormPluginDynamic::clone() const noexcept { - try { - gLogInfo << "EmbLayerNormPluginDynamic clone." << endl; - - auto p = new EmbLayerNormPluginDynamic(mLayerName, mEmbType, mMhaType, mBeta, mGamma, mWordEmb, mPosEmb, - mTokEmb, mUseFullMask); - p->mSeqLen = mSeqLen; - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs EmbLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - // Input should be input ids and token ids and the input mask - // Output should be the embeddings tensor and mask indices - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == 2); // BxS - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[2].nbDims); - - IXRT_PLUGIN_ASSERT(outputIndex == 0 || outputIndex == 1); - - if (outputIndex == 0) { - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } - - DimsExprs ret; - ret.nbDims = 2; - ret.d[0] = inputs[0].d[BDIM]; - ret.d[1] = inputs[0].d[SDIM]; - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool EmbLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - // 3 inputs of size BxS - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - - PluginTensorDesc const& desc = inOut[pos]; - if (desc.format != TensorFormat::kLINEAR) { - return false; - } - if (pos == 0) { - return desc.type == DataType::kINT32; - } - - PluginTensorDesc const& prev = inOut[pos - 1]; - if (pos == 1 || pos == 2) { - return desc.type == DataType::kINT32 && desc.format == prev.format; - } - - // embedded sequence - if (pos == 3) { - return desc.type == mMhaType && desc.format == prev.format; - } - // mask - return desc.type == ((mMhaType == DataType::kHALF) ? DataType::kINT32 : mMhaType); -} - -void EmbLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - gLogInfo << "EmbLayerNormPluginDynamic configurePlugin." 
<< endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 2); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - IXRT_PLUGIN_ASSERT(inputs[0].desc.dims.nbDims == 2); - int32_t const S = inputs[0].desc.dims.d[SDIM]; - mSeqLen = S; - int32_t const B = inputs[0].desc.dims.d[BDIM]; - TRT_UNUSED B; - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[1].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[1].desc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(mSeqLen == static_cast(inputs[2].desc.dims.d[SDIM])); - IXRT_PLUGIN_ASSERT(B == inputs[2].desc.dims.d[BDIM]); - - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(mSeqLen == outputs[0].desc.dims.d[SDIM]) - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[BDIM] == B); - IXRT_PLUGIN_ASSERT(static_cast(outputs[0].desc.dims.d[2]) == mHiddenSize); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.dims.d[4] == 1); - - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[0] == B); - IXRT_PLUGIN_ASSERT(outputs[1].desc.dims.d[1] == mSeqLen); -} - -size_t EmbLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t EmbLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "enqueue EmbLayerNormPluginDynamic.." << endl; - try { - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t status = STATUS_SUCCESS; - int32_t fmha_S = S; - int32_t batch_tokens = B * fmha_S; - - // Our plugin outputs only one tensor - auto const inputIds = static_cast(inputs[0]); - auto const segmentIds = static_cast(inputs[1]); - - half const* beta = mBetaDev.get(); - half const* gamma = mGammaDev.get(); - if (mMhaType == DataType::kFLOAT) { - gLogError << "embLayerNormPlugin float type not supported!" << endl; - return STATUS_NOT_SUPPORTED; - } else if (mMhaType == DataType::kHALF) { - auto output = static_cast(outputs[0]); - auto mNewMask = static_cast(outputs[1]); - auto const wordEmb = static_cast(mWordEmbDev.get()); - auto const tokEmb = static_cast(mTokEmbDev.get()); - auto const posEmb = static_cast(mPosEmbDev.get()); - - status = - embLayerNorm(stream, static_cast(mHiddenSize), B, S, inputIds, segmentIds, beta, gamma, - wordEmb, posEmb, tokEmb, mWordVocabSize, mTokVocabSize, output, mNewMask, mPadId); - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - } - else { - gLogError << "Unsupported type error, expected [kHALF,kFLOAT], but received " - << static_cast(mMhaType) << endl; - - return STATUS_NOT_SUPPORTED; - } - - return status; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu deleted file mode 100644 index 5766d382a6b3bda8cd315bb71916d568e7b380b7..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.cu +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. 
-* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "embLayerNormPlugin.h" -#include "backend/bert/bert_helper.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { - -template -__global__ void IxinferBertEmbedLnKernel(const __half *token_emb, const __half *pos_emb, const __half *type_emb, - const int *tokens, __half *output, int *pad_mask, int *type_ids, int pad_id, - int batch_size, int seq_len, int hidden_dim, const __half *scale, - const __half *bias) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_dim; - output += block_start; - - __half2 *p_output = (__half2 *)output; - __half2 *p_scale = (__half2 *)scale; - __half2 *p_bias = (__half2 *)bias; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - int batch_idx, seq_idx, dim_idx; - batch_idx = blockIdx.x / seq_len; - seq_idx = blockIdx.x % seq_len; - dim_idx = element_index; - int tokens_idx = blockIdx.x; - int token = tokens[tokens_idx]; - int token_type = type_ids[tokens_idx]; - - half2 value; - - if (token == pad_id) { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 1; - } - value.x = __float2half(0.f); - value.y = __float2half(0.f); - - } else { - if (dim_idx == 0) { - pad_mask[tokens_idx] = 0; - } - value = ((half2 *)(token_emb + token * hidden_dim + dim_idx * 2))[0]; - half2 pemb = ((half2 *)(pos_emb + seq_idx * hidden_dim + dim_idx * 2))[0]; - half2 temb = ((half2 *)(type_emb + token_type * hidden_dim + dim_idx * 2))[0]; - - vals[it].x = __half2float(value.x) + __half2float(pemb.x) + __half2float(temb.x); - vals[it].y = __half2float(value.y) + __half2float(pemb.y) + __half2float(temb.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - m2 = rsqrtf(m2 / hidden_dim + epsilon); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - - __half2 scale_value = p_scale[element_index]; - __half2 bias_value = p_bias[element_index]; - - float2 norm_value; - norm_value.x = (vals[it].x - mean) * m2 * __half2float(scale_value.x) + __half2float(bias_value.x); - norm_value.y = (vals[it].y - mean) * m2 * __half2float(scale_value.y) + __half2float(bias_value.y); - - __half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - int token = tokens[tokens_idx]; - if (token == pad_id) { - res.x = __float2half(0.f); - res.y = __float2half(0.f); - p_output[element_index] = res; - } else { - p_output[element_index] = res; - } - } - } -} - -void 
IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, - const int *tokens, half *output, int *pad_mask, int *type_ids, int pad_id, - int batch_size, int seq_len, int hidden_size, const half *scale, const half *bias, - cudaStream_t stream) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 2048"); - } - if (hidden_size / 2 % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size / 2 // C10_WARP_SIZE != 0"); - } - int batch_tokens = batch_size * seq_len; - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferBertEmbedLnKernel<1> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 2: - IxinferBertEmbedLnKernel<2> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 3: - IxinferBertEmbedLnKernel<3> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 4: - IxinferBertEmbedLnKernel<4> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 5: - IxinferBertEmbedLnKernel<5> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 6: - IxinferBertEmbedLnKernel<6> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 7: - IxinferBertEmbedLnKernel<7> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 8: - IxinferBertEmbedLnKernel<8> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 9: - IxinferBertEmbedLnKernel<9> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 10: - IxinferBertEmbedLnKernel<10> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 11: - IxinferBertEmbedLnKernel<11> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 12: - IxinferBertEmbedLnKernel<12> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 13: - IxinferBertEmbedLnKernel<13> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 14: - IxinferBertEmbedLnKernel<14> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 15: - IxinferBertEmbedLnKernel<15> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - break; - case 16: - IxinferBertEmbedLnKernel<16> - <<>>(token_emb, pos_emb, type_emb, tokens, output, pad_mask, type_ids, - pad_id, batch_size, seq_len, hidden_size, scale, bias); - 
break; - default: - throw std::runtime_error("IxinferBertEmbedLn"); - break; - } -} - -cudaError_t embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize, - int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId) -{ - IxinferBertEmbedLn(wordEmb, posEmb, tokEmb, inputIds, output, maskIdx, (int*)segmentIds, - padId, B, S, E, gamma, beta, stream); - return cudaSuccess; -} - -void __global__ IxinferMaskPadKernel(const int32_t* mask, int32_t* new_mask, int bsz, - int ori_seq_len, int hsz, int fmha_seq_len) { - int batch_idx = blockIdx.x; - int seq_idx = blockIdx.y; - - if (seq_idx < ori_seq_len) { - if (threadIdx.x == 0) { - new_mask[batch_idx * fmha_seq_len + seq_idx] = mask[batch_idx * ori_seq_len + seq_idx]; - } - } else { - new_mask[batch_idx * fmha_seq_len + seq_idx] = 1; - } -} - -void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream) { - if (hsz / 2 > 4096) { - throw std::runtime_error("hsz/2>4096"); - } - if (hsz % 2 != 0) { - throw std::runtime_error("hsz % 2 !=0"); - } - if (ori_seq_len > fmha_seq_len) { - throw std::runtime_error("ori_seq_len > fmha_seq_len"); - } - if (bsz * ori_seq_len > batch_tokens) { - throw std::runtime_error("bsz*ori_seq_len > batch_tokens"); - } - dim3 blockSize(bsz, fmha_seq_len); - IxinferMaskPadKernel<<>>(mask, new_mask, bsz, ori_seq_len, hsz, - fmha_seq_len); -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h deleted file mode 100644 index f96e7d7310613be1967072597317e45ee7dfbdb6..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/emb_layernorm/embLayerNormPlugin.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#pragma once -#include -#include -#include -#include -#include - - -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -cudaError embLayerNorm(cudaStream_t stream, int E, int B, int S, int32_t const* inputIds, int32_t const* segmentIds, - half const* beta, half const* gamma, half const* wordEmb, half const* posEmb, half const* tokEmb, int32_t const wordSize, - int32_t const tokSize, half* output, int32_t* maskIdx, int32_t padId); - -void IxinferMaskPad(int32_t* mask, int32_t* new_mask, int bsz, int ori_seq_len, int hsz, - int fmha_seq_len, int batch_tokens, cudaStream_t stream); - -void IxinferBertEmbedLn(const half *token_emb, const half *pos_emb, const half *type_emb, const int *tokens, half *output, - int *pad_mask, int *type_ids, int pad_id, int batch_size, int seq_len, int hidden_size, - const half *scale, const half *bias, cudaStream_t stream);; - -class EmbLayerNormPluginDynamic : public IPluginV2DynamicExt { - public: - EmbLayerNormPluginDynamic(std::string const& name, nvinfer1::DataType const type, nvinfer1::DataType const mhaType, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& word_emb, - nvinfer1::Weights const& pos_emb, nvinfer1::Weights const& tok_emb, bool const useFullMask, int32_t padId = 0); - EmbLayerNormPluginDynamic(std::string const& name, void const* data, size_t length); - EmbLayerNormPluginDynamic() noexcept = delete; - ~EmbLayerNormPluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept override; - int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - private: - const std::string mLayerName; - std::string mNamespace; - size_t mHiddenSize; - size_t mSeqLen; - size_t mPadId; - DataType mEmbType; - bool mUseFullMask; - DataType mMhaType; - size_t mWordVocabSize, mPosVocabSize, mTokVocabSize; - cuda_unique_ptr mGammaDev; - cuda_unique_ptr mBetaDev; - cuda_unique_ptr mWordEmbDev; - cuda_unique_ptr mTokEmbDev; - cuda_unique_ptr mPosEmbDev; - WeightsWithOwnership mBeta; - WeightsWithOwnership mGamma; - WeightsWithOwnership mWordEmb; - 
WeightsWithOwnership mTokEmb; - WeightsWithOwnership mPosEmb; -}; - -class EmbLayerNormPluginDynamicCreator : public IPluginCreator { - public: - EmbLayerNormPluginDynamicCreator(); - - ~EmbLayerNormPluginDynamicCreator() override = default; - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - PluginFieldCollection const* getFieldNames() noexcept override; - - IPluginV2DynamicExt* createPlugin(char const* name, PluginFieldCollection const* fc) noexcept override; - - IPluginV2DynamicExt* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - private: - static PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; - -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp deleted file mode 100644 index 30b47f88ae624db48d86d3d3f35327db82639012..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.cpp +++ /dev/null @@ -1,389 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#include "ffnPlugin.h" - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "gelu/geluPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; -using namespace nvinfer1::ixrt_plugin::backend; - -namespace { -char const* const kFFN_VERSION{"1"}; -char const* const kFFN_NAME{"CustomFFNPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection FFNPluginDynamicCreator::mFFN{}; -std::vector FFNPluginDynamicCreator::mPluginAttributes; - -FFNPluginDynamicCreator::FFNPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("out_dims", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("act_type", nullptr, PluginFieldType::kINT32, 1)); - - mFFN.nbFields = mPluginAttributes.size(); - mFFN.fields = mPluginAttributes.data(); -} - -char const* FFNPluginDynamicCreator::getPluginName() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamicCreator::getPluginVersion() const noexcept { return kFFN_VERSION; } - -PluginFieldCollection const* FFNPluginDynamicCreator::getFieldNames() noexcept { return &mFFN; } - -IPluginV2* FFNPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating FFNPluginDynamicCreator..." << endl; - IXRT_PLUGIN_ASSERT(name != nullptr); - IXRT_PLUGIN_ASSERT(fc != nullptr); - - int32_t outDims = 0; - int32_t typeId = -1; - int32_t act_type = -1; - Weights W1{DataType::kFLOAT, nullptr, 0LL}; - Weights W2{DataType::kFLOAT, nullptr, 0LL}; - Weights B1{DataType::kFLOAT, nullptr, 0LL}; - ixrt_plugin::validateRequiredAttributesExist({"out_dims", "type_id", "W1", "W2", "B1"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string fieldName(fc->fields[i].name); - if (fieldName.compare("out_dims") == 0) { - outDims = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building outDims: " << outDims << endl; - } - - if (fieldName.compare("type_id") == 0) { - typeId = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (fieldName.compare("W1") == 0) { - gLogInfo << "Building W1..." << endl; - W1.values = fc->fields[i].data; - W1.count = fc->fields[i].length; - W1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W1 float32: " << (W1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("W2") == 0) { - gLogInfo << "Building W2..." << endl; - W2.values = fc->fields[i].data; - W2.count = fc->fields[i].length; - W2.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is W2 float32: " << (W2.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("B1") == 0) { - gLogInfo << "Building B1..." << endl; - B1.values = fc->fields[i].data; - B1.count = fc->fields[i].length; - B1.type = fieldTypeToDataType(fc->fields[i].type); - gLogInfo << "Is B1 float32: " << (B1.type == DataType::kFLOAT) << endl; - } - - if (fieldName.compare("act_type") == 0) { - gLogInfo << "Building act_type..." 
<< endl; - act_type = static_cast(fc->fields[i].data)[0]; - gLogInfo << "Building act_type: " << act_type << endl; - } - } - - if (outDims <= 0) { - gLogInfo << "Invalid output dimension" << endl; - } - if (typeId < 0 || typeId > 1) { - gLogInfo << "Invalid type id" << typeId << endl; - } - if (W1.count == 0 || W1.values == nullptr) { - gLogInfo << "Invalid weights W1" << endl; - } - if (W2.count == 0 || W2.values == nullptr) { - gLogInfo << "Invalid weights W2" << endl; - } - if (B1.count == 0 || B1.values == nullptr) { - gLogInfo << "Invalid weights B1" << endl; - } - - DataType type = typeId == 0 ? DataType::kFLOAT : DataType::kHALF; - return new FFNPluginDynamic(name, type, outDims, act_type, W1, W2, B1); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* FFNPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call FFNPluginDynamic::destroy() - try { - return new FFNPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void FFNPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(FFNPluginDynamicCreator); -//#########################################################################// -FFNPluginDynamic::FFNPluginDynamic(std::string const name, DataType const type, int32_t const outDim, - int32_t const act_type, Weights const& W1, Weights const& W2, Weights const& B1) - : mLayerName(name), - mType(type), - mHiddenSize(outDim), - mActType(act_type), - mWdev1(nullptr), - mWdev2(nullptr), - mBdev1(nullptr) { - mW1.convertAndCopy(W1, mType); - mW2.convertAndCopy(W2, mType); - mB1.convertAndCopy(B1, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -FFNPluginDynamic::FFNPluginDynamic(std::string const name, void const* data, size_t length) - : mLayerName(name), mWdev1(nullptr), mWdev2(nullptr), mBdev1(nullptr) { - gLogInfo << "FFNPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mActType); - - char const* d = static_cast(data); - - mW1.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW1, getWeightsSize(mW1, mType), mWdev1); - - mW2.convertAndCopy(d, mHiddenSize * mHiddenSize * 4, mType); - copyToDevice(mW2, getWeightsSize(mW2, mType), mWdev2); - - mB1.convertAndCopy(d, mHiddenSize * 4, mType); - copyToDevice(mB1, getWeightsSize(mB1, mType), mBdev1); -} - -// IPluginV2 Methods -char const* FFNPluginDynamic::getPluginType() const noexcept { return kFFN_NAME; } - -char const* FFNPluginDynamic::getPluginVersion() const noexcept { return kFFN_VERSION; } - -int32_t FFNPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t FFNPluginDynamic::initialize() noexcept { - gLogInfo << "FFNPluginDynamic initialize" << endl; - return 0; -} - -void FFNPluginDynamic::terminate() noexcept { gLogInfo << 
"FFNPluginDynamic terminate" << endl; } - -size_t FFNPluginDynamic::getSerializationSize() const noexcept { - size_t wordSize = getElementSize(mType); - return wordSize * (mHiddenSize * mHiddenSize * 8 + mHiddenSize * 4) + sizeof(mType) + sizeof(mHiddenSize) + - sizeof(mActType); -} - -void FFNPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mActType); - - size_t wordSize = getElementSize(mType); - char* d = static_cast(buffer); - serFromDev(d, static_cast(mWdev1.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mWdev2.get()), 4 * mHiddenSize * mHiddenSize * wordSize); - serFromDev(d, static_cast(mBdev1.get()), 4 * mHiddenSize * wordSize); -} - -void FFNPluginDynamic::destroy() noexcept { - gLogInfo << "FFNPluginDynamic destroy" << endl; - mWdev1.reset(nullptr); - mWdev2.reset(nullptr); - mBdev1.reset(nullptr); - delete this; -} - -void FFNPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* FFNPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType FFNPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF); - return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* FFNPluginDynamic::clone() const noexcept { - try { - gLogInfo << "FFNPluginDynamic clone" << endl; - - auto* p = new FFNPluginDynamic(mLayerName, mType, mHiddenSize, mActType, mW1, mW2, mB1); - p->setPluginNamespace(mNamespace.c_str()); - - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs FFNPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - DimsExprs ret; - ret.nbDims = 5; - ret.d[0] = inputs[0].d[0]; - ret.d[1] = inputs[0].d[1]; - ret.d[2] = exprBuilder.constant(mHiddenSize); - ret.d[3] = exprBuilder.constant(1); - ret.d[4] = exprBuilder.constant(1); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool FFNPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(inOut != nullptr); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - // output - return in.type == prev.type && in.format == prev.format; -} - -void FFNPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - 
IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type); - auto const& inDims0 = inputs[0].desc.dims; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t FFNPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - int32_t const S = inputs[0].dims.d[SDIM]; - int32_t const B = inputs[0].dims.d[BDIM]; - return B * S * 4 * mHiddenSize * sizeof(half); -} - -int32_t FFNPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workSpace, - cudaStream_t stream) noexcept { - gLogInfo << "in FFNPluginDynamic.." << endl; - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - int32_t const S = inputDesc->dims.d[SDIM]; - int32_t const B = inputDesc->dims.d[BDIM]; - int32_t const n = S * B; - IXRT_PLUGIN_ASSERT(n >= 0); - - if (mType == DataType::kHALF) { - auto const* const input = static_cast(inputs[0]); - auto* output = static_cast(outputs[0]); - auto weight1 = static_cast(mWdev1.get()); - auto weight2 = static_cast(mWdev2.get()); - auto bias1 = static_cast(mBdev1.get()); - auto buffer = static_cast(workSpace); - -#ifdef __ILUVATAR__ - cuinfer_gemm(weight1, input, bias1, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, mActType, - stream, cuinfer_handle); - cuinfer_gemm(weight2, buffer, nullptr, output, 1, mHiddenSize, n, 4 * mHiddenSize, 0, 0, 0, 1.0f, -1, - stream, cuinfer_handle); -#else - cublaslt_gemm(weight1, input, buffer, 1, mHiddenSize * 4, n, mHiddenSize, 0, 0, 0, 1.0f, blaslt_handle, - stream); - computeGeluBias(buffer, buffer, bias1, 4 * mHiddenSize, n, stream); - cublaslt_gemm(weight2, buffer, output, 1, mHiddenSize, n, mHiddenSize * 4, 0, 0, 0, 1.0f, blaslt_handle, - stream); -#endif - } else { - gLogError << "Unsupported type error, expected [kHALF], but received " << static_cast(mType) - << endl; - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h deleted file mode 100644 index 21459c9bfe7ed5c1a206e8dc6b920bf17228fc29..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/ffn/ffnPlugin.h +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "NvInferRuntimeCommon.h" -#include "backend/cublas/cublas_helper.h" -#include "bertCommon.h" -#include - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -class FFNPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNPluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - int32_t const out_type, nvinfer1::Weights const& W1, nvinfer1::Weights const& W2, - nvinfer1::Weights const& B1); - - FFNPluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNPluginDynamic without arguments, so we - // delete default constructor. - FFNPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mHiddenSize; - size_t mActType; - - bert::WeightsWithOwnership mW1; - bert::WeightsWithOwnership mB1; - bert::WeightsWithOwnership mW2; - bert::cuda_unique_ptr mWdev1; - bert::cuda_unique_ptr mWdev2; - bert::cuda_unique_ptr mBdev1; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* 
pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFFN; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class FFNInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - FFNInt8PluginDynamic(std::string const name, nvinfer1::DataType const type, int32_t const outDim, - nvinfer1::Weights const& W, nvinfer1::Weights const& Bias, vector const& scale); - - FFNInt8PluginDynamic(std::string const name, void const* data, size_t length); - - // It doesn't make sense to make FFNInt8PluginDynamic without arguments, so we - // delete default constructor. - FFNInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - std::string const mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - size_t mOutDim; // leading dim - size_t mNumParams; - int32_t mNmax; - int32_t mK; - int32_t mNumBias; - - vector mScale; - - bert::WeightsWithOwnership mW; - bert::cuda_unique_ptr mWdev; - - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasdev; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; -}; - -class FFNInt8PluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - FFNInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept 
override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp deleted file mode 100644 index b9ae517746d9ecee513e6834f150142562385c15..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cpp +++ /dev/null @@ -1,355 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "geluPlugin.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kGELU_IXRT_PLUGIN_VERSION{"1"}; -char const* const kGELU_IXRT_PLUGIN_NAME{"CustomGeluPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection GeluPluginDynamicCreator::mFC{}; -std::vector GeluPluginDynamicCreator::mPluginAttributes; - -GeluPluginDynamicCreator::GeluPluginDynamicCreator() { - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("bias", nullptr, PluginFieldType::kFLOAT32, 1)); - - // Fill PluginFieldCollection with PluginField arguments metadata - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* GeluPluginDynamicCreator::getPluginName() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamicCreator::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -PluginFieldCollection const* GeluPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* GeluPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogVerbose << "GeluPluginDynamicCreator createPlugin\n"; - IXRT_PLUGIN_ASSERT(fc != nullptr); - - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - ixrt_plugin::validateRequiredAttributesExist({"type_id", "ld"}, fc); - int32_t ld = 0; - - for (int32_t i = 0; i < fc->nbFields; i++) { - IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr); - std::string fieldName(fc->fields[i].name); - - if (fieldName.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - } - if (fieldName.compare("bias") == 0) { - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - if (fieldName.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - } - } - - if (typeId < 0 || typeId > 3) { - gLogError << "GeluPluginDynamicCreator: invalid typeId " << typeId << std::endl; - return nullptr; - } - - return new GeluPluginDynamic(name, static_cast(typeId), bias, ld); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* GeluPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call GeluPluginDynamic::destroy() - try { - return new GeluPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void GeluPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(GeluPluginDynamicCreator); -//#########################################################################// -GeluPluginDynamic::GeluPluginDynamic(const std::string name, const DataType type, Weights const& bias, const int ld) - : mLayerName(name), mType(type), mLd(ld), mNumBias(bias.count) { - if (mNumBias > 0) 
{ - mBias.convertAndCopy(bias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -GeluPluginDynamic::GeluPluginDynamic(const std::string name, void const* data, size_t length) : mLayerName(name) { - gLogVerbose << "GeluPluginDynamic deserialize\n"; - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mLd); - deserialize_value(&data, &length, &mNumBias); - - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char const* d = static_cast(data); - mBias.convertAndCopy(d, mNumBias, DataType::kHALF); - copyToDevice(mBias, getWeightsSize(mBias, DataType::kHALF), mBiasDev); - } -} - -// IPluginV2 Methods - -char const* GeluPluginDynamic::getPluginType() const noexcept { return kGELU_IXRT_PLUGIN_NAME; } - -char const* GeluPluginDynamic::getPluginVersion() const noexcept { return kGELU_IXRT_PLUGIN_VERSION; } - -int32_t GeluPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t GeluPluginDynamic::initialize() noexcept { - gLogVerbose << "GeluPluginDynamic initalize\n"; - return 0; -} - -void GeluPluginDynamic::terminate() noexcept { gLogVerbose << "GeluPluginDynamic terminate\n"; } - -size_t GeluPluginDynamic::getSerializationSize() const noexcept { - const size_t wordSize = getElementSize(mType); - return sizeof(mType) + sizeof(mLd) + sizeof(mNumBias) + mNumBias * sizeof(half); -} - -void GeluPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mLd); - serialize_value(&buffer, mNumBias); - if (mNumBias > 0) { - IXRT_PLUGIN_ASSERT(mLd > 0); - char* d = static_cast(buffer); - - serFromDev(d, static_cast(mBiasDev.get()), mLd * getElementSize(DataType::kHALF)); - } -} - -void GeluPluginDynamic::destroy() noexcept { - gLogVerbose << "GeluPluginDynamic destroy\n"; - // This gets called when the network containing plugin is destroyed - if (mNumBias > 0) { - mBiasDev.reset(); - } - delete this; -} - -void GeluPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - IXRT_PLUGIN_ASSERT(libNamespace != nullptr); - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* GeluPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -nvinfer1::DataType GeluPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF || - inputTypes[0] == DataType::kINT8); - return inputTypes[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* GeluPluginDynamic::clone() const noexcept { - try { - gLogVerbose << "GeluPluginDynamic clone\n"; - auto* plugin = new GeluPluginDynamic(mLayerName, mType, mBias, mLd); - plugin->setPluginNamespace(mNamespace.c_str()); - return plugin; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, - int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - return inputs[0]; - } catch (std::exception const& 
e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool GeluPluginDynamic::supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(pos >= 0); - IXRT_PLUGIN_ASSERT(pos < nbInputs + nbOutputs); - } catch (std::exception const& e) { - caughtError(e); - return false; - } - - PluginTensorDesc const& input = inOut[0]; - if (pos == 0) { - return (input.type == mType) && (input.format == TensorFormat::kLINEAR); - } - if (pos == 1) { - PluginTensorDesc const& output = inOut[1]; - return (input.type == output.type) && (output.format == TensorFormat::kLINEAR) && (output.type == mType); - } - return false; -} - -void GeluPluginDynamic::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - gLogVerbose << "GeluPluginDynamic configurePlugin\n"; - - try { - IXRT_PLUGIN_ASSERT(in != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 1); - IXRT_PLUGIN_ASSERT(mType == in[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == DataType::kHALF || mType == DataType::kINT8); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t GeluPluginDynamic::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -template -int32_t GeluPluginDynamic::enqueueTyped(void const* input_, void* output_, int32_t const inputVolume, - cudaStream_t stream) noexcept { - TDataType const* input = static_cast(input_); - TDataType* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - TDataType const* bias = static_cast(mBiasDev.get()); - return computeGeluBias(output, input, bias, rows, cols, stream); - } else { - return computeGelu(stream, inputVolume, input, output); - } -} - -int32_t GeluPluginDynamic::enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t const inputVolume, cudaStream_t stream) noexcept { - int8_t const* input = static_cast(input_); - int8_t* output = static_cast(output_); - int32_t const cols = inputVolume / mLd; - int32_t const rows = mLd; - - if (mNumBias > 0) { - half const* bias = static_cast(mBiasDev.get()); - return computeGeluI8O8Bias(output, input, bias, rows, cols, dequant_scale, quant_scale, stream); - } else { - return computeGeluI8O8(stream, inputVolume, input, output, dequant_scale, quant_scale); - } -} - -int32_t GeluPluginDynamic::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, - nvinfer1::PluginTensorDesc const* outputDesc, void const* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputDesc != nullptr); - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } - - int32_t const inputVolume = volume(inputDesc[0].dims); - int32_t batch_token_num = inputDesc[0].dims.d[BDIM] * inputDesc[0].dims.d[SDIM]; - - // Our plugin outputs only one tensor. - // Launch CUDA kernel wrapper and save its return value. 
- switch (mType) { - case DataType::kFLOAT: - return enqueueTyped(inputs[0], outputs[0], inputVolume, stream); - case DataType::kHALF: - return enqueueTyped(inputs[0], outputs[0], inputVolume, stream); - case DataType::kINT8: { - int8_t* input = (int8_t*)(inputs[0]); - int8_t* output = (int8_t*)(outputs[0]); - IxinferBiasGeluI8II8O(batch_token_num, stream, (int8_t*)input, (int8_t*)output, - static_cast(mBiasDev.get()), mLd, inputDesc[0].scale, - 1.0/outputDesc[0].scale); - return STATUS_SUCCESS; - } - default: - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu deleted file mode 100644 index c36cac157bd49795d06c7bbb1f16bcf4b0b5cc8d..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.cu +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "backend/bert/bert_helper.h" -#include "geluPlugin.h" - -namespace nvinfer1::ixrt_plugin { -using namespace backend; -namespace bert { -// constants for approximating the normal cdf -constexpr float A = 0.5f; -constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) -constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - - -template -__global__ void IxinferBiasGeluI8II8OKernel(int8_t *input, int8_t *output, const T *bias, int feature_dim, - float dequant_scale, float quant_scale) { - int block_start = blockIdx.x * feature_dim; - int start = block_start + threadIdx.x; - int end = block_start + feature_dim; - for (int i = start; i < end; i += blockDim.x) { - int input_index = i; - - float fout = gelu(float(input[input_index]) * dequant_scale + __ldg(&bias[i - block_start])); - - int output_index = i; - output[output_index] = float2int8(fout, quant_scale); - } -} - -template <> -__global__ void IxinferBiasGeluI8II8OKernel<__half>(int8_t *input, int8_t *output, const __half *bias, int feature_dim, - float dequant_scale, float quant_scale) { - // #pragma unroll - for (int block_index = 0; block_index < 2; block_index++) { - int block_start = (blockIdx.x * 2 + block_index) * feature_dim; - int start = block_start + threadIdx.x * 4; - int input_index = start; - char4 *p_input = (char4 *)(input + input_index); - half2 *p_bias = (half2 *)(bias + input_index - block_start); - float fout1 = gelu(float(p_input[0].x) * dequant_scale + __half2float(p_bias[0].x)); - float fout2 = gelu(float(p_input[0].y) * dequant_scale + __half2float(p_bias[0].y)); - float fout3 = gelu(float(p_input[0].z) * dequant_scale + __half2float(p_bias[1].x)); - float fout4 = gelu(float(p_input[0].w) * dequant_scale + __half2float(p_bias[1].y)); - - int output_index = start; - char4 out; - out.x = float2int8(fout1, quant_scale); - out.y = float2int8(fout2, quant_scale); - out.z = float2int8(fout3, quant_scale); - out.w = float2int8(fout4, quant_scale); - char4 *p_output = (char4 *)(output + output_index); - - p_output[0] = out; - } -} - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale) { - IxinferBiasGeluI8II8OKernel - <<>>(input, output, bias, feature_dim, dequant_scale, quant_scale); -} - -template void IxinferBiasGeluI8II8O(int, cudaStream_t, int8_t*, int8_t *, const half *, int, float, float); - -template -__global__ void geluKernel(const half a, const half b, const half c, int n, const half* input, half* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const half in = input[idx]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const float* input, float* output) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = input[idx]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluKernel(const float a, const float b, const float c, int n, const int8_t* input, int8_t* output, - float dequant_scale, float quant_scale) { - const int idx = blockIdx.x * TPB + threadIdx.x; - - if (idx < n) { - const float in = float(input[idx]) * dequant_scale; - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 
0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGelu(cudaStream_t stream, int n, const float* input, float* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int computeGelu(cudaStream_t stream, int n, const half* input, half* output) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output); - - return 0; -} - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale) { - constexpr int blockSize = 256; - const int gridSize = (n + blockSize - 1) / blockSize; - geluKernel<<>>(A, B, C, n, input, output, dequant_scale, quant_scale); - - return 0; -} - -template -__global__ void geluBiasKernel(const half a, const half b, const half c, half* output, const half* input, - const half* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const half in = input[idx] + bias[it]; - const half cdf = a + a * __float2half(tanh(__half2float(in * (c * in * in + b)))); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, float* output, const float* input, - const float* bias, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = input[idx] + bias[it]; - const float cdf = a + a * tanh(in * (c * in * in + b)); - output[idx] = in * cdf; - } -} - -template -__global__ void geluBiasKernel(const float a, const float b, const float c, int8_t* output, const int8_t* input, - const half* bias, float dequant_scale, float quant_scale, const int ld) { - const int offset = blockIdx.x * ld; - - for (int it = threadIdx.x; it < ld; it += TPB) { - const int idx = it + offset; - const float in = float(input[idx]) * dequant_scale + __half2float(bias[it]); - const float cdf = a + a * tanh(in * (c * in * in + b)); - float i8_f = in * cdf * quant_scale; - int32_t i8 = floorf(i8_f + 0.5); - i8 = i8 < -127 ? -127 : (i8 > 127 ? 
127 : i8); - output[idx] = int8_t(i8); - } -} - -int computeGeluBias(float* output, const float* input, const float* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int computeGeluBias(half* output, const half* input, const half* bias, const int ld, const int cols, - cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, ld); - return cudaPeekAtLastError(); -} - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream) { - geluBiasKernel<256><<>>(A, B, C, output, input, bias, dequant_scale, quant_scale, ld); - return cudaPeekAtLastError(); -} - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h deleted file mode 100644 index 182fe7f36de0f3cb6bfefee49b4c04a596563003..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/gelu/geluPlugin.h +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif - -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -void IxinferBiasGeluI8II8O(int batch_token_num, cudaStream_t stream, int8_t *input, int8_t *output, const T *bias, - int feature_dim, float dequant_scale, float quant_scale); - -int32_t computeGelu(cudaStream_t stream, int32_t n, float const* input, float* output); - -int32_t computeGelu(cudaStream_t stream, int32_t n, half const* input, half* output); - -int32_t computeGeluI8O8(cudaStream_t stream, int n, const int8_t* input, int8_t* output, float dequant_scale, - float quant_scale); - -int32_t computeGeluBias(float* output, float const* input, float const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluBias(half* output, half const* input, half const* bias, int32_t const ld, int32_t const cols, - cudaStream_t stream); - -int32_t computeGeluI8O8Bias(int8_t* output, const int8_t* input, const half* bias, const int ld, const int cols, - float dequant_scale, float quant_scale, cudaStream_t stream); - -class GeluPluginDynamic : public nvinfer1::IPluginV2DynamicExt { - public: - GeluPluginDynamic(const std::string name, const nvinfer1::DataType type, nvinfer1::Weights const& bias, - const int ld); - - GeluPluginDynamic(const std::string name, void const* data, size_t length); - - // It doesn't make sense to make GeluPluginDynamic without arguments, so we delete - // default constructor. - GeluPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept override; - - private: - // Helper method for enqueue() - template - int32_t enqueueTyped(void const* input, void* output, int32_t const inputVolume, cudaStream_t stream) noexcept; - int32_t enqueueInt8(void const* input_, void* output_, float dequant_scale, float quant_scale, - int32_t 
const inputVolume, cudaStream_t stream) noexcept; - - const std::string mLayerName; - std::string mNamespace; - - nvinfer1::DataType mType; - bert::WeightsWithOwnership mBias; - bert::cuda_unique_ptr mBiasDev; - size_t mLd; - size_t mNumBias; -}; - -class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { - public: - GeluPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - - private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp deleted file mode 100644 index c3a25ba1b2a655ef7bd5bd708a5d8dc5289d32c6..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cpp +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ -#include "qkvToContextInt8Plugin.h" - -#include "NvInferRuntime.h" -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION{"3"}; -char const* const kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"}; -} // namespace - -PluginFieldCollection QKVToContextInt8PluginDynamicCreator::mFC{}; -std::vector QKVToContextInt8PluginDynamicCreator::mPluginAttributes; - -constexpr uint32_t IIDX = 0; // index of the input tensor -constexpr uint32_t MIDX = 1; // index of the mask -/* -dq_probs: -_arrange_qkv_amax -_softmax_in_amax -_softmax_out_amax -*/ -QKVToContextInt8PluginDynamicCreator::QKVToContextInt8PluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 3)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginName() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION; -} - -PluginFieldCollection const* QKVToContextInt8PluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* QKVToContextInt8PluginDynamicCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - int32_t hiddenSize = 0; - // Since numHeads must always exist or validateRequiredAttributes will fail, - // we can set numHeads to -1 so that static analysis tools don't warn about - // a division by zero in QKVToContextInt8PluginDynamic constructor. 
- int32_t numHeads{-1}; - - vector dqProbs; - - ixrt_plugin::validateRequiredAttributesExist({"hidden_size", "num_heads"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - - if (field_name.compare("hidden_size") == 0) { - hiddenSize = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0, - ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str()); - gLogInfo << "Building hiddenSize: " << hiddenSize << endl; - } - if (field_name.compare("num_heads") == 0) { - numHeads = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str()); - gLogInfo << "Building numHeads: " << numHeads << endl; - } - if (field_name.compare("dq_probs") == 0) { - IXRT_PLUGIN_CHECK_VALUE(fc->fields[i].length > 0, - ("QKV: dpProbs can not be empty, error: [dpProbs.length == 0]!")); - gLogInfo << "Building dqProbs: ["; - for (auto j = 0; j < fc->fields[i].length; j++) { - dqProbs.emplace_back(static_cast((fc->fields[i].data))[j]); - gLogInfo << std::setprecision(5) << dqProbs[j]; - } - gLogInfo << "]" << endl; - } - } - - QKVToContextInt8PluginDynamic* p = new QKVToContextInt8PluginDynamic(name, hiddenSize, numHeads, dqProbs); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* QKVToContextInt8PluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - // This object will be deleted when the network is destroyed, which will - // call QKVToContextInt8PluginDynamic::destroy() noexcept - return new QKVToContextInt8PluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void QKVToContextInt8PluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* QKVToContextInt8PluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(QKVToContextInt8PluginDynamicCreator); -//#########################################################################// -QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, - int32_t const numHeads, vector const dqProbs) - : mLayerName(name), - mS(0), - mB(0), - mHeadSize(hiddenSize / numHeads), - mHiddenSize(hiddenSize), - mNumHeads(numHeads), - mDqProbs(dqProbs) {} - -QKVToContextInt8PluginDynamic::QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length) - : mLayerName(name) { - gLogInfo << "deserialize QKVToContextInt8PluginDynamic" << endl; - deserialize_value(&data, &length, &mNumHeads); - deserialize_value(&data, &length, &mHeadSize); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mDqProbs); -} - -// IPluginV2 Methods -char const* QKVToContextInt8PluginDynamic::getPluginType() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextInt8PluginDynamic::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_INT8_IXRT_PLUGIN_VERSION; -} - -int32_t QKVToContextInt8PluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t QKVToContextInt8PluginDynamic::initialize() noexcept { return 0; } - -void QKVToContextInt8PluginDynamic::terminate() noexcept {} - -size_t QKVToContextInt8PluginDynamic::getSerializationSize() const noexcept { - return 
sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(mHiddenSize) + mDqProbs.size() * sizeof(float) + - sizeof(mDqProbs.size()); -} - -void QKVToContextInt8PluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mNumHeads); - serialize_value(&buffer, mHeadSize); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mDqProbs); -} - -void QKVToContextInt8PluginDynamic::destroy() noexcept { delete this; } - -void QKVToContextInt8PluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* QKVToContextInt8PluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType QKVToContextInt8PluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t nbInputs) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0) - return DataType::kINT8; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* QKVToContextInt8PluginDynamic::clone() const noexcept { - try { - QKVToContextInt8PluginDynamic* ret = - new QKVToContextInt8PluginDynamic(mLayerName, mHiddenSize, mNumHeads, mDqProbs); - - ret->setPluginNamespace(mNamespace.c_str()); - return ret; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs QKVToContextInt8PluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - // input [B, S, 3*E] int8 - // pad_mask [B, S] int8 - - // output [B, S, E] int8 - IXRT_PLUGIN_ASSERT(outputIndex == 0); - // Copy over everything - DimsExprs output(inputs[IIDX]); - // Divide last dim by three - auto const* three = exprBuilder.constant(3); - output.d[HDIM] = exprBuilder.constant(mHiddenSize); - return output; -} -bool QKVToContextInt8PluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - return (inOut[pos].type == DataType::kINT8) && (inOut[pos].format == TensorFormat::kLINEAR); -} - -void QKVToContextInt8PluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, - DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - PluginTensorDesc const& inDesc = in[IIDX].desc; - PluginTensorDesc const& outDesc = out[0].desc; - IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5) - IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1); - - PluginTensorDesc const& maskDesc = in[MIDX].desc; - IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]); - - const int32_t S = inDesc.dims.d[SDIM]; - - IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == S); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1); - -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif -} - -size_t QKVToContextInt8PluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* 
outputs, - int32_t nbOutputs) const noexcept { - const int32_t B = inputs[0].dims.d[BDIM]; - const int32_t S = inputs->dims.d[SDIM]; - const int32_t E = inputs->dims.d[HDIM]; - IXRT_PLUGIN_ASSERT(E == 3 * mHiddenSize); - int64_t buffer_size = B * S * E * sizeof(int8_t) + B * S * S * mNumHeads * sizeof(int8_t); -#ifndef __ILUVATAR__ - buffer_size += B * S * S * mNumHeads * sizeof(int32_t); -#endif - return buffer_size; -} - -int32_t QKVToContextInt8PluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, 0)); -#endif - int32_t const B = inputDesc[0].dims.d[BDIM]; - int32_t const S = inputDesc[0].dims.d[SDIM]; - - float qkv_out_amax_ = inputDesc[0].scale * 127; - float linear_in_amax_ = outputDesc[0].scale * 127; - float arrange_qkv_amax_ = mDqProbs[0]; - float softmax_in_amax_ = mDqProbs[1]; - float softmax_out_amax_ = mDqProbs[2]; - - int8_t* qkv_buffer_ = (int8_t*)inputs[0]; - int8_t* qkv_out_ = (int8_t*)outputs[0]; - int8_t* mask_ = (int8_t*)inputs[1]; - - int64_t buffer_size = B * S * mHiddenSize; - int64_t buffer_size2 = B * S * S * mNumHeads; - int8_t* q_buffer_ = static_cast(workspace); - int8_t* k_buffer_ = q_buffer_ + buffer_size; - int8_t* v_buffer_ = k_buffer_ + buffer_size; - int8_t* qk_buffer_ = v_buffer_ + buffer_size; - -#ifdef __ILUVATAR__ - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, cuinfer_handle, stream); -#else - int32_t* qk_out_ = reinterpret_cast(qk_buffer_ + buffer_size2); - auto status = - fused_multihead_attetion_int8(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, - qk_buffer_, B, S, mHeadSize, mNumHeads, mHiddenSize, arrange_qkv_amax_, - softmax_in_amax_, softmax_out_amax_, linear_in_amax_, blaslt_handle, stream); -#endif - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu deleted file mode 100644 index 2330debf3e1bee647c70336b35729699b90ad06e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.cu +++ /dev/null @@ -1,488 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
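In `enqueue` above, the workspace returned by `getWorkspaceSize` is carved into contiguous int8 regions for the rearranged Q, K and V tensors plus the attention-score buffer, and the calibration amax values are recovered from TensorRT's per-tensor scales as `amax = scale * 127`. A host-side sketch of that partitioning under the same layout assumptions (the names are descriptive, not the plugin's):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical view of the INT8 attention workspace: q/k/v each hold one
// [B, H, S, D] int8 tensor, and qk holds the [B, H, S, S] int8 score matrix.
struct Int8AttnWorkspace {
    int8_t* q;
    int8_t* k;
    int8_t* v;
    int8_t* qk;
};

size_t int8AttnWorkspaceBytes(int B, int S, int H, int D) {
    size_t qkv = static_cast<size_t>(B) * S * H * D;  // one of q, k, or v
    size_t scores = static_cast<size_t>(B) * H * S * S;
    return 3 * qkv + scores;                          // all buffers are int8
}

Int8AttnWorkspace sliceWorkspace(void* workspace, int B, int S, int H, int D) {
    size_t qkv = static_cast<size_t>(B) * S * H * D;
    auto* base = static_cast<int8_t*>(workspace);
    return {base, base + qkv, base + 2 * qkv, base + 3 * qkv};
}
```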
-*/ -#include "backend/bert/bert_helper.h" -#include "backend/cublas/cublas_helper.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#endif -#include "qkvToContextInt8Plugin.h" - -using namespace nvinfer1::ixrt_plugin::backend; - -namespace nvinfer1::ixrt_plugin { -namespace bert { -const int _max_thread_per_block = 1024; -const float _quant_range = 127.0; - -__global__ void IxinferArrangeEncselfQkvI8II8ONoBias(const int8_t *ori_qkv, int8_t *new_qkv, int max_batch_dim, - int batch_seq_len, int dim_per_head, int head_num) { - int hidden_size = dim_per_head * head_num; - int batch_id = blockIdx.x / batch_seq_len; - int token_id = blockIdx.x % batch_seq_len; - - int i = threadIdx.x; // 1个线程处理4个数据 - - int head_id = (i * 4) / dim_per_head; - int dim_id = (i * 4) % dim_per_head; - int target_id = targetid_4dim(batch_id, head_id, token_id, dim_id, head_num, batch_seq_len, dim_per_head); - -#pragma unroll - for (int qkv_idx = 0; qkv_idx < 3; qkv_idx++) { - char4 *p_ori_qkv = (char4 *)(ori_qkv + (blockIdx.x * 3 + qkv_idx) * hidden_size); - int qkv_offset = max_batch_dim * qkv_idx; - char4 *p_new_qkv = (char4 *)(new_qkv + qkv_offset + target_id); - p_new_qkv[0] = p_ori_qkv[i]; - } -} - -template -__global__ void IxinferCorrelationSoftmaxEncselfI8II8OKernel(int8_t *correlation, const int8_t *src_padding_mask, - int batch_seq_len, float quant_scale, - float dequant_scale) { - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE; - int local_idx = threadIdx.x; - - for (int warp_idx = 0; warp_idx < WARP_BATCH; ++warp_idx) { - int start_idx = (blockIdx.x * gridDim.y * WARP_BATCH * gridDim.z * batch_seq_len + - (blockIdx.y + gridDim.y * warp_idx) * gridDim.z * batch_seq_len + blockIdx.z * batch_seq_len); - - char4 *p_correlation = (char4 *)(correlation + start_idx); - char4 *p_src_padding_mask = (char4 *)(src_padding_mask + blockIdx.x * batch_seq_len); - - // load data from global memory - // float - float4 elements[WARP_ITERATIONS]; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - if (element_index < batch_seq_len / 4) { - char4 mask = p_src_padding_mask[element_index]; - char4 correlation_value = p_correlation[element_index]; - - elements[it].x = - mask.x ? -INFINITY : (float)correlation_value.x * dequant_scale; - elements[it].y = - mask.y ? -INFINITY : (float)correlation_value.y * dequant_scale; - elements[it].z = - mask.z ? -INFINITY : (float)correlation_value.z * dequant_scale; - elements[it].w = - mask.w ? -INFINITY : (float)correlation_value.w * dequant_scale; - - } else { - elements[it].x = -INFINITY; - elements[it].y = -INFINITY; - elements[it].z = -INFINITY; - elements[it].w = -INFINITY; - } - } - - // compute max_value - float max_value = elements[0].x; - max_value = (max_value > elements[0].y) ? max_value : elements[0].y; - max_value = (max_value > elements[0].z) ? max_value : elements[0].z; - max_value = (max_value > elements[0].w) ? max_value : elements[0].w; - -#pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - max_value = (max_value > elements[it].x) ? max_value : elements[it].x; - max_value = (max_value > elements[it].y) ? max_value : elements[it].y; - max_value = (max_value > elements[it].z) ? max_value : elements[it].z; - max_value = (max_value > elements[it].w) ? 
max_value : elements[it].w; - } - - warp_reduce(&max_value); - - // exp sum - float sum = 0.0f; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[it].x = __expf(elements[it].x - max_value); - elements[it].y = __expf(elements[it].y - max_value); - elements[it].z = __expf(elements[it].z - max_value); - elements[it].w = __expf(elements[it].w - max_value); - - sum += (elements[it].x + elements[it].y + elements[it].z + elements[it].w); - } - - warp_reduce(&sum); - sum = 1.0f / sum; - // store result -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - char4 correlation_value; - if (element_index < batch_seq_len / 4) { - correlation_value.x = float2int8(elements[it].x * sum, quant_scale); - correlation_value.y = float2int8(elements[it].y * sum, quant_scale); - correlation_value.z = float2int8(elements[it].z * sum, quant_scale); - correlation_value.w = float2int8(elements[it].w * sum, quant_scale); - - p_correlation[element_index] = correlation_value; - - } else { - break; - } - } - } -} - -void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - int8_t *correlation, const int8_t *src_padding_mask, float quant_scale, - float dequant_scale) { - const int NUM_INT8_SOFTMAX_BATCH_WARP = 4; - if (batch_seq_len > 512) { - throw std::runtime_error("batch_seq_len should <= 512"); - } - if (head_num % NUM_INT8_SOFTMAX_BATCH_WARP != 0) { - throw std::runtime_error("head_num % NUM_INT8_SOFTMAX_BATCH_WARP !0"); - } - if (batch_seq_len % 4 != 0) { - throw std::runtime_error("batch_seq_len % 4 != 0"); - } - - int log2_elements = log2_ceil(batch_seq_len / 4); - int next_power_of_two = 1 << log2_elements; - int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? 
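The reductions above follow the standard warp-softmax pattern: each lane holds a slice of one attention row, the warp agrees on a maximum, exponentiates, agrees on a sum, and rescales. Before the launcher's `log2_elements` dispatch, a minimal standalone sketch of that pattern for a single row of at most 32 floats, using shuffle intrinsics directly rather than the `warp_reduce` helper and the vectorized `char4` loads of the kernel above:

```cuda
#include <cuda_runtime.h>
#include <math_constants.h>

// One 32-thread warp normalizes one row of `len <= 32` logits in place.
// Masked positions are assumed to already hold -INFINITY, matching the
// padding-mask handling above. Illustrative only, not the plugin's kernel.
__global__ void warpSoftmaxRow(float* row, int len) {
    int lane = threadIdx.x;  // launch with blockDim.x == 32
    float v = (lane < len) ? row[lane] : -CUDART_INF_F;

    // Warp-wide max via butterfly shuffles.
    float m = v;
    for (int offset = 16; offset > 0; offset >>= 1)
        m = fmaxf(m, __shfl_xor_sync(0xffffffff, m, offset));

    // Exponentiate, then warp-wide sum.
    float e = (lane < len) ? __expf(v - m) : 0.0f;
    float s = e;
    for (int offset = 16; offset > 0; offset >>= 1)
        s += __shfl_xor_sync(0xffffffff, s, offset);

    if (lane < len) row[lane] = e / s;
}
```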
next_power_of_two : C10_WARP_SIZE; - // dim3 blockSize(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, - // batch_seq_len); - // - dim3 grid(batch_size, head_num / NUM_INT8_SOFTMAX_BATCH_WARP, batch_seq_len); - - dim3 block(SOFT_WARP_SIZE); - - switch (log2_elements) { - case 0: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<0, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - - break; - - case 1: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<1, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 2: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<2, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 3: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<3, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 4: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<4, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 5: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<5, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - - case 6: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<6, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 7: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<7, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 8: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<8, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - case 9: - IxinferCorrelationSoftmaxEncselfI8II8OKernel<9, NUM_INT8_SOFTMAX_BATCH_WARP> - <<>>(correlation, src_padding_mask, batch_seq_len, quant_scale, dequant_scale); - break; - default: - throw std::runtime_error( - "ker_correlation_softmax_encself_i8I_i8O_ix_ " - "NotImplementedError"); - break; - } -} - - -__global__ void IxinferArrangeAttenOutputI8II8OKernel(const int8_t *ori_q, int8_t *new_q, int beam_size, - int dim_per_head, int head_num, float quant_scale, - float dequant_scale) { - int hidden_size = dim_per_head * head_num; - -#pragma unroll - for (int blockin = 0; blockin < 4; blockin++) { - int batch_id = (blockIdx.x * 4 + blockin) / beam_size; - // note, for encoder, beam_id is token_id; for decoder, beam_id is beam_id - int beam_id = (blockIdx.x * 4 + blockin) % beam_size; - int i = threadIdx.x; - int out_index = (blockIdx.x * 4 + blockin) * hidden_size + i; - int head_id = i / dim_per_head; - int dim_id = i % dim_per_head; - - char4 *p_ori_q = (char4 *)ori_q; - char4 *p_new_q = (char4 *)new_q; - char4 value; - - value = p_ori_q[targetid_4dim(batch_id, head_id, beam_id, dim_id, head_num, beam_size, dim_per_head)]; - value.x = float2int8(value.x * dequant_scale, quant_scale); - value.y = float2int8(value.y * dequant_scale, quant_scale); - value.z = float2int8(value.z * dequant_scale, quant_scale); - value.w = float2int8(value.w * dequant_scale, quant_scale); - p_new_q[out_index] = value; - } -} - -void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q, - int8_t *new_q, int beam_size, int dim_per_head, int head_num, - int max_thread_per_block, 
float quant_scale, float dequant_scale) { - int qual_hidden_size = hidden_size >> 2; - int qual_dim_per_head = dim_per_head >> 2; - IxinferArrangeAttenOutputI8II8OKernel<<>>( - ori_q, new_q, beam_size, qual_dim_per_head, head_num, quant_scale, dequant_scale); -} - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle, - cudaStream_t& stream) { - int batch_token_num = batch_size * batch_seq_len; - int max_batch_dim = batch_token_num * hidden_size; - - float scaleCtx = linear_in_amax / _quant_range; - float scaleArrange = arrange_qkv_amax / _quant_range; - float scaleSoftin = softmax_in_amax / _quant_range; - float scaleSoftout = softmax_out_amax / _quant_range; - - float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim); - float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx; - - IxinferArrangeEncselfQkvI8II8ONoBias<<>>( - qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num); - - switch (head_dim) { - case 64: - case 128: - case 192: - case 256: { - cuinferFlashAttnConfigInfo flashAttnInfo; - flashAttnInfo.scaling = sqrt(1.f / (head_dim * 1.0)); - flashAttnInfo.quantParam.q_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.k_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.v_amax = arrange_qkv_amax; - flashAttnInfo.quantParam.p_amax = softmax_out_amax; - flashAttnInfo.quantParam.o_amax = linear_in_amax; - - cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc; - CUINFER_CHECK(cuinferCreateTensorDescriptor(&qDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&kDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&vDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&maskDesc)); - CUINFER_CHECK(cuinferCreateTensorDescriptor(&oDesc)); - - CUINFER_CHECK(cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, 1, 1, batch_seq_len)); - CUINFER_CHECK(cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, - CUINFER_DATA_INT8, batch_size, head_num, batch_seq_len, - head_dim)); - - CUINFER_CHECK(cuinferFMHAForwardEx(cuinfer_handle, flashAttnInfo, qDesc, q_buffer, kDesc, k_buffer, vDesc, - v_buffer, maskDesc, mask, oDesc, qk_buffer)); - break; - } - default: { - cuinfer_i8_gemm(k_buffer, q_buffer, nullptr, qkv_buffer, batch_size * head_num, batch_seq_len, - batch_seq_len, head_dim, batch_seq_len * head_dim, batch_seq_len * head_dim, - batch_seq_len * batch_seq_len, scaleBmm1, 0.0, 0, cuinfer_handle, stream); - - IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qkv_buffer, mask, - 1.0 / scaleSoftout, scaleSoftin); - - cuinfer_nn_i8_gemm(v_buffer, qkv_buffer, qk_buffer, batch_size * 
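The two batched int8 GEMMs above fold every per-tensor quantization factor into a single output scale each: `scaleBmm1` combines the Q and K dequantization, the 1/sqrt(head_dim) attention scaling and the softmax-input requantization, while `scaleBmm2` combines the softmax-output and V dequantization with the context requantization. A small host-side restatement of that arithmetic (descriptive names, not the plugin's identifiers):

```cpp
#include <cmath>

// Quantization scales derived from calibration amax values, mirroring the
// arithmetic in the fused INT8 attention path (quant range 127).
struct AttnScales {
    float bmm1;  // folded into the Q * K^T int8 GEMM
    float bmm2;  // folded into the probs * V int8 GEMM
};

AttnScales foldAttentionScales(float arrangeQkvAmax, float softmaxInAmax,
                               float softmaxOutAmax, float linearInAmax,
                               int headDim) {
    const float quantRange = 127.0f;
    float sArrange = arrangeQkvAmax / quantRange;   // int8 -> float for Q and K
    float sSoftIn  = softmaxInAmax  / quantRange;   // float -> int8 before softmax
    float sSoftOut = softmaxOutAmax / quantRange;   // int8 -> float for the probs
    float sCtx     = linearInAmax   / quantRange;   // float -> int8 for the output

    AttnScales out;
    out.bmm1 = sArrange * sArrange / sSoftIn * std::sqrt(1.0f / headDim);
    out.bmm2 = sSoftOut * sArrange / sCtx;
    return out;
}
```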
head_num, head_dim, batch_seq_len, - batch_seq_len, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, - batch_seq_len * head_dim, scaleBmm2, cuinfer_handle, stream); - break; - } - } - - IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, qk_buffer, qkv_out, batch_seq_len, head_dim, - head_num, _max_thread_per_block, 1.f, 1.f); - return cudaSuccess; -} -#else -template -__global__ void quant_qkv_gemm(const int32_t* input, int8_t* output, int hidden_size, float quant_scale, int num_per_tca) { - float4 val[THREAD_DATA_LEN]; - - int block_id = blockIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z; - int block_start = block_id * hidden_size; - input += block_start; - output += block_start; - - int4* p_input = (int4*)input; - char4* p_output = (char4*)output; - - float4 bias_val; -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * num_per_tca; - char4 q_input; - q_input.x = float2int8(p_input[element_index].x*1.0, quant_scale); - q_input.y = float2int8(p_input[element_index].y*1.0, quant_scale); - q_input.z = float2int8(p_input[element_index].z*1.0, quant_scale); - q_input.w = float2int8(p_input[element_index].w*1.0, quant_scale); - - p_output[element_index] = q_input; - } -} - -void quantQKVGemm(int32_t* input, int8_t* output, int batch_size, int head_num, int batch_seq_len, int hidden_size, float dequant_scale, cudaStream_t stream) { - if (hidden_size > 4096) { - throw std::runtime_error("hidden_size should <= 4096"); - } - int num_per_tca = min(hidden_size / 4, C10_WARP_SIZE); - dim3 gridSize(batch_size, head_num, batch_seq_len); - dim3 blockSize(num_per_tca); - - int num_warp = hidden_size / num_per_tca / 4; - switch (num_warp) { - case 1: - quant_qkv_gemm<1> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 2: - quant_qkv_gemm<2> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 3: - quant_qkv_gemm<3> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 4: - quant_qkv_gemm<4> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 5: - quant_qkv_gemm<5> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 6: - quant_qkv_gemm<6> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 7: - quant_qkv_gemm<7> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 8: - quant_qkv_gemm<8> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 9: - quant_qkv_gemm<9> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 10: - quant_qkv_gemm<10> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 11: - quant_qkv_gemm<11> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 12: - quant_qkv_gemm<12> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 13: - quant_qkv_gemm<13> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 14: - quant_qkv_gemm<14> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 15: - quant_qkv_gemm<15> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - case 16: - quant_qkv_gemm<16> - <<>>(input, output, hidden_size, dequant_scale, num_per_tca); - break; - default: - throw std::runtime_error("quantQKVGemm"); - break; - } -} - - -cudaError_t fused_multihead_attetion_int8(int8_t *qkv_buffer, 
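On the non-Iluvatar path the cuBLASLt GEMMs accumulate in int32, so `quant_qkv_gemm` exists purely to requantize those accumulators back to int8 before the softmax and the second GEMM. A minimal sketch of that step; the clamping and round-to-nearest policy below is an assumption, since `float2int8` is defined elsewhere in the repository:

```cuda
#include <cuda_runtime.h>

// Requantize int32 GEMM accumulators to int8 with a single scale, the same
// role quant_qkv_gemm plays on the cuBLASLt path. Standalone sketch only.
__global__ void requantizeI32ToI8(const int32_t* in, int8_t* out, int n, float scale) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    float v = static_cast<float>(in[i]) * scale;
    v = fminf(fmaxf(v, -127.0f), 127.0f);             // clamp to the int8 range
    out[i] = static_cast<int8_t>(__float2int_rn(v));  // round to nearest
}
```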
int8_t *mask, int8_t *q_buffer, int8_t *k_buffer, - int8_t *v_buffer, int32_t *qk_out, int8_t *qkv_out, int8_t *qk_buffer, int batch_size, - int batch_seq_len, int head_dim, int head_num, int hidden_size, - float arrange_qkv_amax, float softmax_in_amax, float softmax_out_amax, - float linear_in_amax, cublasLtHandle_t &cublas_lt_handle, - cudaStream_t &stream) { - int batch_token_num = batch_size * batch_seq_len; - int max_batch_dim = batch_token_num * hidden_size; - - float scaleCtx = linear_in_amax / _quant_range; - float scaleArrange = arrange_qkv_amax / _quant_range; - float scaleSoftin = softmax_in_amax / _quant_range; - float scaleSoftout = softmax_out_amax / _quant_range; - - float scaleBmm1 = scaleArrange * scaleArrange / scaleSoftin * sqrt(1.f / head_dim); - float scaleBmm2 = scaleSoftout * scaleArrange / scaleCtx; - - IxinferArrangeEncselfQkvI8II8ONoBias<<>>( - qkv_buffer, q_buffer, max_batch_dim, batch_seq_len, head_dim, head_num); - - cublaslt_gemm(k_buffer, q_buffer, qk_out, batch_size * head_num, batch_seq_len, batch_seq_len, head_dim, - batch_seq_len * head_dim, batch_seq_len * head_dim, batch_seq_len * batch_seq_len, 1, - cublas_lt_handle, stream); - quantQKVGemm(qk_out, qk_buffer, batch_size, head_num, batch_seq_len, batch_seq_len, scaleBmm1, stream); - - IxinferCorrelationSoftmaxEncselfI8II8O(batch_size, batch_seq_len, head_num, stream, qk_buffer, mask, - 1.0 / scaleSoftout, scaleSoftin); - - cublaslt_gemm_nn(v_buffer, qk_buffer, qk_out, batch_size * head_num, head_dim, batch_seq_len, batch_seq_len, - batch_seq_len * head_dim, batch_seq_len * batch_seq_len, batch_seq_len * head_dim, 1, - cublas_lt_handle, stream); - quantQKVGemm(qk_out, q_buffer, batch_size, head_num, batch_seq_len, head_dim, scaleBmm2, stream); - - IxinferArrangeAttenOutputI8II8O(batch_token_num, hidden_size, stream, q_buffer, qkv_out, batch_seq_len, head_dim, - head_num, _max_thread_per_block, 1.f, 1.f); - return cudaSuccess; -} -#endif -} // namespace bert -} // namespace nvinfer1::ixrt_plugin diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h deleted file mode 100644 index b5c501fc35e06d259c62391dbaa43f7c3473481e..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextInt8Plugin.h +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#include -#include "NvInferRuntime.h" -#include "bertCommon.h" -#include -#include -#ifdef __ILUVATAR__ -#include "ixinfer.h" -#endif - -namespace nvinfer1::ixrt_plugin -{ -namespace bert -{ - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, cuinferHandle_t& cuinfer_handle, - cudaStream_t& stream); -#else -cudaError_t fused_multihead_attetion_int8(int8_t* qkv_buffer, int8_t* mask, int8_t* q_buffer, int8_t* k_buffer, - int8_t* v_buffer, int32_t* qk_out, int8_t* qkv_out, int8_t* qk_buffer, - int batch_size, int batch_seq_len, int head_dim, int head_num, - int hidden_size, float arrange_qkv_amax, float softmax_in_amax, - float softmax_out_amax, float linear_in_amax, - cublasLtHandle_t& cublas_lt_handle, cudaStream_t& stream); -#endif - -void IxinferCorrelationSoftmaxEncselfI8II8O(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - int8_t *correlation, const int8_t *src_padding_mask, float quant_scale, - float dequant_scale); - -void IxinferArrangeAttenOutputI8II8O(int batch_token_num, int hidden_size, cudaStream_t stream, const int8_t *ori_q, - int8_t *new_q, int beam_size, int dim_per_head, int head_num, - int max_thread_per_block, float quant_scale, float dequant_scale); -class QKVToContextInt8PluginDynamic : public nvinfer1::IPluginV2DynamicExt -{ -public: - QKVToContextInt8PluginDynamic(std::string const& name, int32_t const hiddenSize, int32_t const numHeads, - vector const dqProbs); - - QKVToContextInt8PluginDynamic(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make QKVToContextInt8PluginDynamic without arguments, so we - // delete default constructor. 
- QKVToContextInt8PluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - -protected: - void createMHARunner() noexcept; - int32_t getSMVersion() const noexcept; - -private: - std::string const& mLayerName; - std::string mNamespace; - - int32_t mS; - int32_t mB; - int32_t mSM; - int32_t mHeadSize; - int32_t mHiddenSize; - int32_t mNumHeads; - - cuda_unique_ptr mQkvBias; - - vector mDqProbs; - bool mUseInt8ScaleMax{true}; - -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif -}; - -class QKVToContextInt8PluginDynamicCreator : public nvinfer1::IPluginCreator -{ -public: - QKVToContextInt8PluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp deleted file mode 100644 index a69fb957ceb7a2d6bb7d4e5edc46fbe9fc8ca63c..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cpp +++ /dev/null @@ -1,388 +0,0 @@ -/* 
Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#include "qkvToContextPlugin.h" - -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "common_def.cuh" -#include "cuda_runtime_api.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" -#include -#include - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION{"1"}; -char const* const kQKV_TO_CONTEXT_VAR_SEQLEN_IXRT_PLUGIN_VERSION{"2"}; -char const* const kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME{"CustomQKVToContextPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection QKVToContextPluginDynamicCreator::mFC{}; -std::vector QKVToContextPluginDynamicCreator::mPluginAttributes; - -constexpr uint32_t IIDX = 0; // index of the input tensor -constexpr uint32_t MIDX = 1; // index of the mask - -QKVToContextPluginDynamicCreator::QKVToContextPluginDynamicCreator() { - mPluginAttributes.emplace_back(PluginField("type_id", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("hidden_size", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("num_heads", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("has_mask", nullptr, PluginFieldType::kINT32, 1)); - mPluginAttributes.emplace_back(PluginField("dq_probs", nullptr, PluginFieldType::kFLOAT32, 1)); - - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* QKVToContextPluginDynamicCreator::getPluginName() const noexcept { - return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; -} - -char const* QKVToContextPluginDynamicCreator::getPluginVersion() const noexcept { - return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; -} - -PluginFieldCollection const* QKVToContextPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* QKVToContextPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "Creating QKV2ContextPlugin..." 
<< endl; - IXRT_PLUGIN_ASSERT(fc != nullptr); - int32_t hiddenSize = 0; - // Since numHeads must always exist or validateRequiredAttributes will fail, - // we can set numHeads to -1 so that static analysis tools don't warn about - // a division by zero in QKVToContextPluginDynamic constructor. - int32_t numHeads{-1}; - bool hasMask = false; - int32_t typeId = -1; - - float dqProbs = -1; - - IXRT_PLUGIN_ASSERT(fc->fields != nullptr); - ixrt_plugin::validateRequiredAttributesExist({"type_id", "hidden_size", "num_heads", "has_mask"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - IXRT_PLUGIN_ASSERT(fc->fields[i].name != nullptr); - IXRT_PLUGIN_ASSERT(fc->fields[i].data != nullptr); - std::string field_name(fc->fields[i].name); - - if (field_name.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 2, - ("QKV: Invalid TypeId " + std::to_string(typeId)).c_str()); - gLogInfo << "Building typeId: " << typeId << endl; - } - if (field_name.compare("hidden_size") == 0) { - hiddenSize = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hiddenSize > 0, - ("QKV: Invalid hiddenSize " + std::to_string(hiddenSize)).c_str()); - gLogInfo << "Building hiddenSize: " << hiddenSize << endl; - } - if (field_name.compare("num_heads") == 0) { - numHeads = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(numHeads > 0, ("QKV: Invalid numHeads " + std::to_string(numHeads)).c_str()); - gLogInfo << "Building numHeads: " << numHeads << endl; - } - if (field_name.compare("has_mask") == 0) { - auto hasMaskValue = *static_cast(fc->fields[i].data); - IXRT_PLUGIN_CHECK_VALUE(hasMaskValue == 0 || hasMaskValue == 1, - ("QKV: Invalid hasMask " + std::to_string(hasMaskValue)).c_str()); - hasMask = static_cast(hasMaskValue); - gLogInfo << "Building hasMask: " << hasMask << endl; - } - } - - gLogInfo << "Building the Plugin..." 
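The creator walks the `PluginFieldCollection` by field name and copies each scalar attribute out of the raw `data` pointer. A standalone mirror of that lookup for int32 attributes; the `Field` struct below only imitates the shape of `nvinfer1::PluginField` for illustration and is not the TensorRT type:

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Minimal mirror of the creator's attribute parsing: scan the fields and
// copy a named int32 attribute out of its untyped data pointer.
struct Field {
    const char* name;
    const void* data;
    int32_t length;
};

bool readInt32Field(const Field* fields, int32_t nbFields,
                    const std::string& wanted, int32_t& value) {
    for (int32_t i = 0; i < nbFields; ++i) {
        if (fields[i].name && wanted == fields[i].name && fields[i].data) {
            std::memcpy(&value, fields[i].data, sizeof(int32_t));
            return true;
        }
    }
    return false;  // required attributes are validated separately by the caller
}
```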
<< endl; - auto type = static_cast(typeId); - auto* p = new QKVToContextPluginDynamic(name, type, hiddenSize, numHeads, dqProbs, hasMask); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* QKVToContextPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call QKVToContextPluginDynamic::destroy() - return new QKVToContextPluginDynamic(name, serialData, serialLength); -} - -void QKVToContextPluginDynamicCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* QKVToContextPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// REGISTER_TENSORRT_PLUGIN(QKVToContextPluginDynamicCreator); -//#########################################################################// -QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, const DataType type, - const int32_t hiddenSize, const int32_t numHeads, - float const dqProbs, bool hasImask) - : mLayerName(name), - mS(0), - mB(0), - mHeadSize(hiddenSize / numHeads), - mHiddenSize(hiddenSize), - mNumHeads(numHeads), - mHasImask(hasImask), - mType(type) - -{ - // -} - -QKVToContextPluginDynamic::QKVToContextPluginDynamic(const std::string name, void const* data, size_t length) - : mLayerName(name) { - gLogInfo << "QKV Deser Start" << endl; - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mNumHeads); - deserialize_value(&data, &length, &mHeadSize); - deserialize_value(&data, &length, &mHasImask); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mS); - deserialize_value(&data, &length, &mB); - - gLogInfo << "QKV Deser done" << endl; -} - -// IPluginV2 Methods -char const* QKVToContextPluginDynamic::getPluginType() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_NAME; } - -char const* QKVToContextPluginDynamic::getPluginVersion() const noexcept { return kQKV_TO_CONTEXT_IXRT_PLUGIN_VERSION; } - -int32_t QKVToContextPluginDynamic::getNbOutputs() const noexcept { return 1; } - -int32_t QKVToContextPluginDynamic::initialize() noexcept { return 0; } - -void QKVToContextPluginDynamic::terminate() noexcept {} - -size_t QKVToContextPluginDynamic::getSerializationSize() const noexcept { - return sizeof(mNumHeads) + sizeof(mHeadSize) + sizeof(DataType) + sizeof(mHasImask) + sizeof(mHiddenSize) + - sizeof(mS) + sizeof(mB); -} - -void QKVToContextPluginDynamic::serialize(void* buffer) const noexcept { - serialize_value(&buffer, mType); - serialize_value(&buffer, mNumHeads); - serialize_value(&buffer, mHeadSize); - serialize_value(&buffer, mHasImask); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mS); - serialize_value(&buffer, mB); -} - -void QKVToContextPluginDynamic::destroy() noexcept { delete this; } - -void QKVToContextPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* QKVToContextPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2Ext Methods -DataType QKVToContextPluginDynamic::getOutputDataType(int32_t index, nvinfer1::DataType const* inputTypes, - int32_t /*nbInputs*/) const noexcept { - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(inputTypes[0] == DataType::kFLOAT || inputTypes[0] == DataType::kHALF || - inputTypes[0] == DataType::kINT8); - 
return inputTypes[0]; -} - -// IPluginV2DynamicExt Methods -nvinfer1::IPluginV2DynamicExt* QKVToContextPluginDynamic::clone() const noexcept { - gLogInfo << "QKV Clone" << endl; - - QKVToContextPluginDynamic* ret = nullptr; - ret = new QKVToContextPluginDynamic(mLayerName, mType, mHiddenSize, mNumHeads, mDqProbs, mHasImask); - - ret->setPluginNamespace(mNamespace.c_str()); - gLogInfo << "QKV Clone done" << endl; - return ret; -} - -DimsExprs QKVToContextPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t /*nbInputs*/, IExprBuilder& exprBuilder) noexcept { - // Input is BxSx3*N*H, output should be BxSxN*H - IXRT_PLUGIN_ASSERT(outputIndex == 0); - // Copy over everything - DimsExprs output(inputs[IIDX]); - // Divide last dim by three - auto const* three = exprBuilder.constant(3); - output.d[HDIM] = exprBuilder.constant(mHiddenSize); - return output; -} -bool QKVToContextPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t /*nbOutputs*/) noexcept { - IXRT_PLUGIN_ASSERT(pos >= 0); - IXRT_PLUGIN_ASSERT(pos < 2 + mHasImask); - IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask); - auto const* in = inOut; - auto const* out = inOut + nbInputs; - - if (pos == 0) { - return (in->type == mType) && (in->format == TensorFormat::kLINEAR); - } - - // pos==1 - if ((mHasImask && pos == 1)) // pos 1 is the mask - { - auto const* inMask = &inOut[1]; - - // detect full mask and check that it was produced - return (inMask->type == DataType::kINT32) && // precision - (inMask->format == TensorFormat::kLINEAR); // format - } - - if (!mHasImask || pos == 2) // output pos - { - return (in->type == out->type) && (out->format == TensorFormat::kLINEAR); - } - - return false; -} -void QKVToContextPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, - DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept { - IXRT_PLUGIN_ASSERT(nbInputs == 1 + mHasImask); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - PluginTensorDesc const& inDesc = in[IIDX].desc; - TRT_UNUSED inDesc; - PluginTensorDesc const& outDesc = out->desc; - TRT_UNUSED outDesc; - IXRT_PLUGIN_ASSERT(mType == inDesc.type); - IXRT_PLUGIN_ASSERT(mType == outDesc.type); - IXRT_PLUGIN_ASSERT(inDesc.dims.nbDims == 5) - IXRT_PLUGIN_ASSERT(inDesc.dims.d[HDIM] == 3 * mHiddenSize); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDesc.dims.d[4] == 1); - if (mHasImask) { - PluginTensorDesc const& maskDesc = in[MIDX].desc; - TRT_UNUSED maskDesc; - IXRT_PLUGIN_ASSERT(maskDesc.dims.nbDims == 2); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[0] == inDesc.dims.d[0]); - IXRT_PLUGIN_ASSERT(maskDesc.dims.d[1] == inDesc.dims.d[1]); - } - - const int32_t S = inDesc.dims.d[SDIM]; - const int32_t B = inDesc.dims.d[BDIM] <= 0 ? 
in->max.d[BDIM] : inDesc.dims.d[BDIM]; - mS = S; - mB = B; - - IXRT_PLUGIN_ASSERT(outDesc.dims.nbDims == 5); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[BDIM] == inDesc.dims.d[BDIM]); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[SDIM] == mS); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[HDIM] == mHiddenSize); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[3] == 1); - IXRT_PLUGIN_ASSERT(outDesc.dims.d[4] == 1); -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferCreate(&cuinfer_handle)); -#else - CHECK_GPU_ERROR(cublasLtCreate(&blaslt_handle)); -#endif -} - -size_t QKVToContextPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - const int32_t B = inputs->dims.d[BDIM]; - const int32_t S = inputs->dims.d[SDIM]; - const int32_t E = inputs->dims.d[2]; - int32_t fmha_S = S; - int64_t buffer_size = B * fmha_S * E; -#ifndef __ILUVATAR__ - buffer_size += B * S * S * mNumHeads; -#endif - return 4 * buffer_size * sizeof(mType); -} - -inline void print_element(half* x, int num, string name) { - printf("%s: \n", name.c_str()); - half* out = (half*)malloc(num * sizeof(half)); - cudaMemcpy(out, x, num * sizeof(half), cudaMemcpyDeviceToHost); - for (auto i = 0; i < num; i++) { - printf("%f\n", __half2float(out[i])); - } - printf("\n"); -} - -int32_t QKVToContextPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "in QKVToContextPluginDynamic.." << endl; - int32_t S = inputDesc->dims.d[SDIM]; - int32_t B = inputDesc->dims.d[BDIM]; - int32_t status = STATUS_SUCCESS; -#ifdef __ILUVATAR__ - CUINFER_CHECK(cuinferSetStream(cuinfer_handle, stream)); -#endif - - try { - if (mType != DataType::kHALF) { - gLogError << "embLayerNormPlugin infer type{" << int(mType) << "} not supported!" << endl; - return STATUS_NOT_SUPPORTED; - } - half* qkv_buffer_ = (half*)inputs[0]; - half* qkv_out_ = (half*)outputs[0]; - // [B, fmha_S] - int32_t* mask_ = mHasImask ? 
(int32_t*)inputs[1] : nullptr; - int fmha_seq_len = S; - - int64_t buffer_size = B * fmha_seq_len * mHiddenSize; - half* q_buffer_ = reinterpret_cast(workspace); - half* k_buffer_ = q_buffer_ + buffer_size; - half* v_buffer_ = k_buffer_ + buffer_size; - - - // [B, S, 3*E, 1, 1] [B, fmha_S] -#ifdef __ILUVATAR__ - auto status = - fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qkv_out_, B, mHeadSize, - mNumHeads, mHiddenSize, S, fmha_seq_len, cuinfer_handle, stream); -#else - half* qk_out_ = v_buffer_ + buffer_size; - auto status = - fused_multihead_attetion(qkv_buffer_, mask_, q_buffer_, k_buffer_, v_buffer_, qk_out_, qkv_out_, B, mHeadSize, - mNumHeads, mHiddenSize, S, fmha_seq_len, blaslt_handle, stream); -#endif - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - - } catch (std::exception const& e) { - caughtError(e); - return STATUS_FAILURE; - } -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu deleted file mode 100644 index fb9455c6c2f1dfcdc3e75fec03c16eb4169ed2db..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.cu +++ /dev/null @@ -1,317 +0,0 @@ -#include "qkvToContextPlugin.h" -#include "backend/bert/bert_helper.h" -#ifdef __ILUVATAR__ -#include "backend/ixinfer/ixinfer_gemm_helper.h" -#else -#include "backend/cublas/cublas_helper.h" -#endif - -using namespace nvinfer1::ixrt_plugin::backend; - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -void __global__ IxinferArrangeEncQkvKernel(half *ori_qkv, half *new_q, half *new_k, half *new_v, - int head_dim, int head_num, int batch_seq_len, int fmha_seq_len) { - int hidden_size = head_dim * head_num; - int batch_id = blockIdx.x; - int token_id = blockIdx.y; - - int i = threadIdx.x; // 1个线程处理2个数据 - int head_id = (i * 2) / head_dim; - int dim_id = (i * 2) % head_dim; - - half2 *p_ori_qkv = (half2 *)(ori_qkv + batch_id * batch_seq_len * hidden_size * 3 + token_id * hidden_size * 3); - half2 *p_new_qkv; - - int target_id = batch_id * head_num * fmha_seq_len * head_dim + head_id * fmha_seq_len * head_dim + - token_id * head_dim + dim_id; - /* q */ - p_new_qkv = (half2 *)(new_q + target_id); - p_new_qkv[0] = p_ori_qkv[i]; - /* k */ - p_ori_qkv += hidden_size / 2; - p_new_qkv = (half2 *)(new_k + target_id); - p_new_qkv[0] = p_ori_qkv[i]; - /* v */ - p_ori_qkv += hidden_size / 2; - p_new_qkv = (half2 *)(new_v + target_id); - p_new_qkv[0] = p_ori_qkv[i]; -} - -void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz, - int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream) { - int hsz = head_num * head_dim; - if (hsz / 2 > 4096) { - throw std::runtime_error("hidden_size / 2 > 4096"); - } - if (hsz % 2 != 0) { - throw std::runtime_error("hsz % 2 != 0"); - } - if (head_dim % 2 != 0) { - throw std::runtime_error("head_dim %2 != 0"); - } - dim3 blockSize(bsz, ori_seq_len); - IxinferArrangeEncQkvKernel<<>>(ori_qkv, new_q, new_k, new_v, head_dim, - head_num, ori_seq_len, fmha_seq_len); -} - -__global__ void IxinferEncAttnOutArrangeKernel(const half *ori_q, half *new_q, const int bsz, const int ori_seq_len, - const int fmha_seq_len, const int head_num, const int head_dim) { - half2 *p_ori_q = (half2 *)ori_q; - half2 *p_new_q = (half2 *)new_q; - - int batch_token_num = ori_seq_len * head_dim * head_num; - int hidden_size = head_dim 
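`IxinferArrangeEncQkvKernel` above splits the fused `[B, S, 3, H, D]` projection output into separate Q, K and V tensors laid out as `[B, H, S, D]` using vectorized `half2` copies. A plain host-side reference of the same index mapping, in float for readability (illustrative only):

```cpp
#include <cstddef>

// Source layout: [B, S, 3, H, D] with Q, K, V interleaved per token.
// Destination layout for each of Q/K/V: [B, H, S, D].
inline size_t srcIndex(int b, int s, int which /*0=Q,1=K,2=V*/, int h, int d,
                       int S, int H, int D) {
    return (((static_cast<size_t>(b) * S + s) * 3 + which) * H + h) * D + d;
}

inline size_t dstIndex(int b, int h, int s, int d, int S, int H, int D) {
    return ((static_cast<size_t>(b) * H + h) * S + s) * D + d;
}

void arrangeQkv(const float* qkv, float* q, float* k, float* v,
                int B, int S, int H, int D) {
    float* dst[3] = {q, k, v};
    for (int b = 0; b < B; ++b)
        for (int s = 0; s < S; ++s)
            for (int which = 0; which < 3; ++which)
                for (int h = 0; h < H; ++h)
                    for (int d = 0; d < D; ++d)
                        dst[which][dstIndex(b, h, s, d, S, H, D)] =
                            qkv[srcIndex(b, s, which, h, d, S, H, D)];
}
```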
* head_num; - int date_length = bsz * ori_seq_len * head_num * head_dim; - - int elem_idx = threadIdx.x + blockIdx.x * blockDim.x; - while (elem_idx < date_length / 2) { - int half_elem_idx = elem_idx * 2; - - int bsz_idx = half_elem_idx / batch_token_num; - int seq_idx = half_elem_idx % batch_token_num / hidden_size; - int head_idx = half_elem_idx % batch_token_num % hidden_size / head_dim; - int dim_idx = half_elem_idx % batch_token_num % hidden_size % head_dim; - - int src_index = bsz_idx * head_num * fmha_seq_len * head_dim + head_idx * fmha_seq_len * head_dim + - seq_idx * head_dim + dim_idx; - - p_new_q[elem_idx] = p_ori_q[src_index / 2]; - - elem_idx += gridDim.x * blockDim.x; - } -} - -void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num, - int head_dim, cudaStream_t stream) { - if (bsz * ori_seq_len * head_num * head_dim % 2 != 0) { - throw std::runtime_error("bsz * ori_seq_len * head_num * head_dim % 2 != 0"); - } - int data_length = bsz * ori_seq_len * head_num * head_dim / 2; - int num_threads = 512; - int num_blocks = ((data_length - 1 + num_threads) / num_threads); - num_blocks = std::min(num_blocks, 128); - IxinferEncAttnOutArrangeKernel<<>>(ori_q, new_q, bsz, ori_seq_len, fmha_seq_len, - head_num, head_dim); -} - - -template -__global__ void IxinferCorrelationSoftmaxEncselfKernel(__half *correlation, const int *src_padding_mask, - const int batch_seq_len) { - constexpr int next_power_of_two = 1 << log2_elements; - constexpr int SOFT_WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - constexpr int WARP_ITERATIONS = next_power_of_two / SOFT_WARP_SIZE; - - int head_num = blockDim.y; - int seq_len = gridDim.y; - int start_idx = (blockIdx.x * head_num * seq_len * batch_seq_len + threadIdx.y * seq_len * batch_seq_len + - blockIdx.y * batch_seq_len); - - half2 *p_correlation = (half2 *)(correlation + start_idx); - int32_t *p_mask = (int32_t *)(src_padding_mask + blockIdx.x * batch_seq_len); - - int local_idx = threadIdx.x; - - float2 elements[WARP_ITERATIONS]; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - if (element_index < batch_seq_len / 2) { - half2 correlation_value = p_correlation[element_index]; - - elements[it].x = - p_mask[element_index * 2] ? -INFINITY : __half2float(correlation_value.x); - elements[it].y = p_mask[element_index * 2 + 1] ? -INFINITY - : __half2float(correlation_value.y); - - } else { - elements[it].x = -INFINITY; - elements[it].y = -INFINITY; - } - } - - float max_value = elements[0].x; - max_value = (max_value > elements[0].y) ? max_value : elements[0].y; - -#pragma unroll - for (int it = 1; it < WARP_ITERATIONS; ++it) { - max_value = (max_value > elements[it].x) ? max_value : elements[it].x; - max_value = (max_value > elements[it].y) ? 
max_value : elements[it].y; - } - - warp_reduce(&max_value); - - float sum = 0.0f; -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[it].x = __expf(elements[it].x - max_value); - elements[it].y = __expf(elements[it].y - max_value); - - sum += (elements[it].x + elements[it].y); - } - - warp_reduce(&sum); - sum = 1.0f / sum; - -#pragma unroll - for (int it = 0; it < WARP_ITERATIONS; ++it) { - int element_index = local_idx + it * SOFT_WARP_SIZE; - half2 correlation_value; - if (element_index < batch_seq_len / 2) { - correlation_value.x = __float2half(elements[it].x * sum); - correlation_value.y = __float2half(elements[it].y * sum); - - p_correlation[element_index] = correlation_value; - - } else { - break; - } - } -} - -void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - __half *correlation, const int *src_padding_mask) { - if (batch_seq_len > 4096) { - throw std::runtime_error("batch_seq_len should <= 4096"); - } - if (batch_seq_len % 2 != 0) { - throw std::runtime_error("batch_seq_len % 2 != 0"); - } - - int log2_elements = log2_ceil(batch_seq_len / 2); - int next_power_of_two = 1 << log2_elements; - int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; - - dim3 grid(batch_size, batch_seq_len); - - dim3 block(WARP_SIZE, head_num); - - switch (log2_elements) { - case 0: - IxinferCorrelationSoftmaxEncselfKernel<0> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 1: - IxinferCorrelationSoftmaxEncselfKernel<1> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 2: - IxinferCorrelationSoftmaxEncselfKernel<2> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 3: - IxinferCorrelationSoftmaxEncselfKernel<3> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 4: - IxinferCorrelationSoftmaxEncselfKernel<4> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 5: - IxinferCorrelationSoftmaxEncselfKernel<5> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - - case 6: - IxinferCorrelationSoftmaxEncselfKernel<6> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 7: - IxinferCorrelationSoftmaxEncselfKernel<7> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 8: - IxinferCorrelationSoftmaxEncselfKernel<8> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 9: - IxinferCorrelationSoftmaxEncselfKernel<9> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 10: - IxinferCorrelationSoftmaxEncselfKernel<10> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 11: - IxinferCorrelationSoftmaxEncselfKernel<11> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - case 12: - IxinferCorrelationSoftmaxEncselfKernel<12> - <<>>(correlation, src_padding_mask, batch_seq_len); - break; - default: - throw std::runtime_error("IxinferCorrelationSoftmaxEncself NotImplementedError"); - break; - } -} - -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cuinferHandle_t &cuinfer_handle, cudaStream_t &stream) { - /* qkv arrange*/ - // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim) - IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len, - 
fmha_seq_len, stream); - - cuinferTensorDescriptor_t qDesc, kDesc, vDesc, maskDesc, oDesc; - cuinferDataType_t _cuinferCompType = cuinferDataType_t::CUINFER_DATA_FLOAT; - cuinferDataType_t _cuinferDataType = cuinferDataType_t::CUINFER_DATA_HALF; - cuinferDataType_t _cuinferMaskType = cuinferDataType_t::CUINFER_DATA_INT32; - cuinferCreateTensorDescriptor(&qDesc); - cuinferCreateTensorDescriptor(&kDesc); - cuinferCreateTensorDescriptor(&vDesc); - cuinferCreateTensorDescriptor(&maskDesc); - cuinferCreateTensorDescriptor(&oDesc); - - cuinferSetTensor4dDescriptor(qDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num, - fmha_seq_len, head_dim); - cuinferSetTensor4dDescriptor(kDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num, - fmha_seq_len, head_dim); - cuinferSetTensor4dDescriptor(vDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num, - fmha_seq_len, head_dim); - cuinferSetTensor4dDescriptor(maskDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferMaskType, bsz, 1, 1, - fmha_seq_len); - cuinferSetTensor4dDescriptor(oDesc, cuinferTensorFormat_t::CUINFER_TENSOR_NCHW, _cuinferDataType, bsz, head_num, - fmha_seq_len, head_dim); - - cuinferFMHAParam fmha_param; - cuinferFMHAForward(cuinfer_handle, fmha_param, _cuinferCompType, _cuinferDataType, _cuinferMaskType, qDesc, - q_buffer, kDesc, k_buffer, vDesc, v_buffer, maskDesc, mask, oDesc, q_buffer, true); - - IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream); - return cudaSuccess; -} -#else -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cublasLtHandle_t &blaslt_handle, cudaStream_t &stream) { - /* qkv arrange*/ - // bsz,ori_seq_len,3*hsz -> 3*(bsz,head_num,fmha_seq_len,head_dim) - IxinferArrangeEncQkv(qkv_buffer, q_buffer, k_buffer, v_buffer, bsz, head_num, head_dim, ori_seq_len, - fmha_seq_len, stream); - - cublaslt_gemm(k_buffer, q_buffer, qk_out, bsz * head_num, fmha_seq_len, fmha_seq_len, head_dim, - fmha_seq_len * head_dim, fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, 1.0/sqrt(head_dim*1.0), blaslt_handle, stream); - - IxinferCorrelationSoftmaxEncself(bsz, fmha_seq_len, head_num, stream, qk_out, mask); - - cublaslt_gemm_nn(v_buffer, qk_out, q_buffer, bsz * head_num, head_dim, fmha_seq_len, fmha_seq_len, - fmha_seq_len * head_dim, fmha_seq_len * fmha_seq_len, fmha_seq_len * head_dim, 1.0f, blaslt_handle, stream); - - IxinferEncAttnOutArrange(q_buffer, qkv_out, bsz, ori_seq_len, fmha_seq_len, head_num, head_dim, stream); - return cudaSuccess; -} -#endif -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h deleted file mode 100644 index aaee52b710d275427188b9bf8174bfc9b924faaf..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/qkv_to_context/qkvToContextPlugin.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#ifdef __ILUVATAR__ -#include -#endif -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - - -namespace nvinfer1::ixrt_plugin { -namespace bert { -#ifdef __ILUVATAR__ -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cuinferHandle_t &cuinfer_handle, cudaStream_t &stream); -#else -cudaError_t fused_multihead_attetion(half* qkv_buffer, int32_t* mask, - half* q_buffer, half* k_buffer, half* v_buffer, half* qk_out, half* qkv_out, - int bsz, int head_dim, int head_num, int hsz, int ori_seq_len, int fmha_seq_len, - cublasLtHandle_t &blaslt_handle, cudaStream_t &stream); -#endif - -void IxinferArrangeEncQkv(half *ori_qkv, half *new_q, half *new_k, half *new_v, int bsz, - int head_num, int head_dim, int ori_seq_len, int fmha_seq_len, cudaStream_t stream); - -void IxinferEncAttnOutArrange(half *ori_q, half *new_q, int bsz, int ori_seq_len, int fmha_seq_len, int head_num, - int head_dim, cudaStream_t stream); - -void IxinferCorrelationSoftmaxEncself(int batch_size, int batch_seq_len, int head_num, cudaStream_t stream, - half *correlation, const int *src_padding_mask); - -class QKVToContextPluginDynamic : public nvinfer1::IPluginV2DynamicExt -{ -public: - QKVToContextPluginDynamic(const std::string name, const nvinfer1::DataType type, const int32_t hiddenSize, - const int32_t numHeads, float const dqProbs, bool hasImask = false); - - QKVToContextPluginDynamic(const std::string name, void const* data, size_t length); - - // It doesn't make sense to make QKVToContextPluginDynamic without arguments, so we - // delete default constructor. 
- QKVToContextPluginDynamic() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - -private: - const std::string mLayerName; - std::string mNamespace; - - int32_t mS; - int32_t mB; - int32_t mSM; - int32_t mHeadSize; - int32_t mHiddenSize; - int32_t mNumHeads; - bool mHasImask; - nvinfer1::DataType mType; - float mDqProbs; -#ifdef __ILUVATAR__ - cuinferHandle_t cuinfer_handle; -#else - cublasLtHandle_t blaslt_handle; -#endif - cudaStream_t stream; - - half* query_; -}; - -class QKVToContextPluginDynamicCreator : public nvinfer1::IPluginCreator -{ -public: - QKVToContextPluginDynamicCreator(); - - char const* getPluginName() const noexcept override; - - char const* getPluginVersion() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static vector mPluginAttributes; - std::string mNamespace; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp deleted file mode 100644 index 6e4e5a37e148b4ad3719cbce8bef4e3261a83c89..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cpp +++ /dev/null @@ -1,404 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
-* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ -#include "skipLayerNormInt8Plugin.h" - -#include "NvInferRuntime.h" -#include "checkMacrosPlugin.h" -#include "driver_types.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -// Clip plugin specific constants -namespace { -char const* kSKIP_LAYER_NORM_INT8_VERSION_HFACE{"3"}; -char const* kSKIP_LAYER_NORM_INT8_VERSION_MTRON{"4"}; -char const* kSKIP_LAYER_NORM_INT8_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection SkipLayerNormInt8PluginBaseCreator::mFC{}; -std::vector SkipLayerNormInt8PluginBaseCreator::mPluginAttributes; - -constexpr auto param_type = DataType::kFLOAT; - -SkipLayerNormInt8PluginBaseCreator::SkipLayerNormInt8PluginBaseCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("beta")); - mPluginAttributes.emplace_back(PluginField("gamma")); - mPluginAttributes.emplace_back(PluginField("bias")); - mPluginAttributes.emplace_back(PluginField("output_fp32")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -SkipLayerNormInt8PluginHFaceCreator::SkipLayerNormInt8PluginHFaceCreator() : SkipLayerNormInt8PluginBaseCreator() {} - -char const* SkipLayerNormInt8PluginBaseCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; } - -PluginFieldCollection const* SkipLayerNormInt8PluginBaseCreator::getFieldNames() noexcept { return &mFC; } - -void SkipLayerNormInt8PluginBaseCreator::setPluginNamespace(char const* libNamespace) noexcept { - mNamespace = libNamespace; -} - -char const* SkipLayerNormInt8PluginBaseCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -char const* SkipLayerNormInt8PluginHFaceCreator::getPluginVersion() const noexcept { - return kSKIP_LAYER_NORM_INT8_VERSION_HFACE; -} - -bool buildBetaAndGamma(PluginFieldCollection const* fc, Weights& beta, Weights& gamma, Weights& bias) { - ixrt_plugin::validateRequiredAttributesExist({"beta", "gamma"}, fc); - - bool output_fp32 = false; - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - - if (field_name.compare("beta") == 0) { - gLogInfo << "Building beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("gamma") == 0) { - gLogInfo << "Building gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bias") == 0) { - gLogInfo << "Building bias..." 
<< endl; - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("output_fp32") == 0) { - IXRT_PLUGIN_ASSERT(fc->fields[i].type == PluginFieldType::kINT32); - output_fp32 = (static_cast(fc->fields[i].data)[0] == 1); - gLogInfo << "Building output_fp32" << output_fp32 << endl; - } - } - - IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta"); - IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta"); - - IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma"); - IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma"); - return output_fp32; -} - -IPluginV2* SkipLayerNormInt8PluginHFaceCreator::createPlugin(char const* name, - PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "SkipLayerNormInt8PluginHFaceCreator createPlugin" << endl; - - Weights beta{DataType::kFLOAT, nullptr, 0}; - Weights gamma{DataType::kFLOAT, nullptr, 0}; - Weights bias{DataType::kFLOAT, nullptr, 0}; - bool output_fp32 = buildBetaAndGamma(fc, beta, gamma, bias); - return new SkipLayerNormInt8PluginHFace(name, beta, gamma, bias, output_fp32); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -IPluginV2* SkipLayerNormInt8PluginHFaceCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - // This object will be deleted when the network is destroyed, which will - // call SkipLayerNormInterleavedPlugin::destroy() - try { - gLogInfo << "SkipLayerNormInterleavedPluginHFaceCreator deserializePlugin" << endl; - return new SkipLayerNormInt8PluginHFace(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -// REGISTER_TENSORRT_PLUGIN(SkipLayerNormInt8PluginHFaceCreator); -//#########################################################################// -SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, Weights const& beta, - Weights const& gamma, Weights const& bias, bool output_fp32) - : mLayerName(name), - mGammaDev(nullptr), - mBetaDev(nullptr), - mBiasDev(nullptr), - mLd(beta.count), - mParamsOnDevice(false), - output_fp32(output_fp32) { - IXRT_PLUGIN_ASSERT(mLd > 0); - IXRT_PLUGIN_ASSERT(beta.count == gamma.count); - // dataType for beta, gamma weights is always fp16 - mParamWordsize = getElementSize(param_type); - - mBeta.convertAndCopy(beta, param_type); - mGamma.convertAndCopy(gamma, param_type); - - mHasBias = (bias.values != nullptr); - if (mHasBias) { - mBias.convertAndCopy(bias, param_type); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev); - } -} - -SkipLayerNormInt8PluginBase::SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mParamsOnDevice(false) { - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mLd); - deserialize_value(&data, &length, &mHasBias); - deserialize_value(&data, &length, &output_fp32); - - mParamWordsize = getElementSize(param_type); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mLd, param_type); - mGamma.convertAndCopy(d, mLd, param_type); - - if (mHasBias) { - mBias.convertAndCopy(d, mLd, param_type); - 
} - - copyToDevice(mGamma, getWeightsSize(mGamma, param_type), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, param_type), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, param_type), mBiasDev); - } -} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, Weights const& beta, - Weights const& gamma, Weights const& bias, bool output_fp32) - : SkipLayerNormInt8PluginBase(name, beta, gamma, bias, output_fp32) {} - -SkipLayerNormInt8PluginHFace::SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length) - : SkipLayerNormInt8PluginBase(name, data, length) { - gLogInfo << "SkipLayerNormInt8PluginHFace deserialize" << endl; -} - -// IPluginV2 Methods -char const* SkipLayerNormInt8PluginBase::getPluginType() const noexcept { return kSKIP_LAYER_NORM_INT8_NAME; } - -size_t SkipLayerNormInt8PluginBase::getSerializationSize() const noexcept { - const size_t biasSize = mHasBias ? (mLd * mParamWordsize) : 0; - return 2 * mParamWordsize * mLd + sizeof(mLd) + sizeof(mHasBias) + sizeof(output_fp32) + biasSize; -} - -void SkipLayerNormInt8PluginBase::serialize(void* buffer) const noexcept { - try { - serialize_value(&buffer, mLd); - serialize_value(&buffer, mHasBias); - serialize_value(&buffer, output_fp32); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mBetaDev.get()), mLd * mParamWordsize); - serFromDev(d, static_cast(mGammaDev.get()), mLd * mParamWordsize); - if (mHasBias) { - serFromDev(d, static_cast(mBiasDev.get()), mLd * mParamWordsize); - } - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::destroy() noexcept { - try { - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - if (mHasBias) { - mBiasDev.reset(nullptr); - } - delete this; - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormInt8PluginBase::setPluginNamespace(char const* libNamespace) noexcept { mNamespace = libNamespace; } - -char const* SkipLayerNormInt8PluginBase::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// HFace -int32_t SkipLayerNormInt8PluginHFace::initialize() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace initialize" << endl; - return 0; -} - -void SkipLayerNormInt8PluginHFace::terminate() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace terminate" << endl; -} - -void SkipLayerNormInt8PluginHFace::destroy() noexcept { - gLogInfo << "SkipLayerNormInterleavedPluginHFace destroy" << endl; - SkipLayerNormInt8PluginBase::destroy(); -} - -char const* SkipLayerNormInt8PluginHFace::getPluginVersion() const noexcept { - return kSKIP_LAYER_NORM_INT8_VERSION_HFACE; -} - -int32_t SkipLayerNormInt8PluginHFace::getNbOutputs() const noexcept { return 2; } - -// IPluginV2Ext Methods -DataType SkipLayerNormInt8PluginBase::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(index >= 0 && index < getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - if (index == 0) { - return output_fp32 ? 
DataType::kFLOAT : DataType::kINT8; - } - return DataType::kFLOAT; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2DynamicExt Methods -DimsExprs SkipLayerNormInt8PluginBase::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(outputIndex >= 0 && outputIndex < getNbOutputs()); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - return inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool SkipLayerNormInt8PluginBase::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, - int32_t nbInputs, int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs)); - - PluginTensorDesc const& desc = inOut[pos]; - if (pos == 2 || pos == 4 || (output_fp32 && pos == 3)) { - return desc.type == DataType::kFLOAT && desc.format == TensorFormat::kLINEAR; - } - return desc.type == DataType::kINT8 && desc.format == TensorFormat::kLINEAR; - } catch (std::exception const& e) { - caughtError(e); - } - return false; -} - -void SkipLayerNormInt8PluginBase::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - // Validate input arguments - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(nbOutputs == getNbOutputs()); - IXRT_PLUGIN_ASSERT(nbInputs == 3); - - auto const& inDims0 = inputs[0].desc.dims; - auto const& inDims1 = inputs[1].desc.dims; - auto const& inDims2 = inputs[2].desc.dims; - TRT_UNUSED inDims1; - TRT_UNUSED inDims2; - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims2.nbDims); - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims2.d)); - - mParamWordsize = getElementSize(param_type); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t SkipLayerNormInt8PluginBase::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, - int32_t nbOutputs) const noexcept { - return 0; -} - -// HFace IPluginV2DynamicExt Methods -IPluginV2DynamicExt* SkipLayerNormInt8PluginHFace::clone() const noexcept { - try { - gLogInfo << "SkipLayerNormInterleavedPluginHFace clone" << endl; - auto* p = new SkipLayerNormInt8PluginHFace(mLayerName, mBeta, mGamma, mBias, output_fp32); - p->initialize(); - p->setPluginNamespace(mNamespace.c_str()); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -int32_t SkipLayerNormInt8PluginHFace::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - auto const iDesc = inputDesc[0]; - auto const oDesc = outputDesc[0]; - - const int32_t B = iDesc.dims.d[0]; - const int32_t S = iDesc.dims.d[1]; - const int32_t E = iDesc.dims.d[2]; - int 
batch_token_num = B * S; - float const dqScaleIn = iDesc.scale; - IXRT_PLUGIN_ASSERT(dqScaleIn > 1e-9); - float const qScale = oDesc.scale; - int8_t const* input = static_cast(inputs[0]); - int8_t const* skip = static_cast(inputs[1]); - float* residual = (float*)inputs[2]; - float const* gamma = static_cast(mGammaDev.get()); - float const* beta = static_cast(mBetaDev.get()); - float const* bias = static_cast(mBiasDev.get()); - float* residual_out = static_cast(outputs[1]); - - if (!output_fp32) { - int8_t* output = static_cast(outputs[0]); - skipLayerNormI8II8O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - dqScaleIn, 1.0 / qScale, 1024, stream, true); - } else { - float* output = static_cast(outputs[0]); - skipLayerNormI8IF32O(input, gamma, beta, bias, output, residual, residual_out, batch_token_num, E, - 1.0 / dqScaleIn, 1.0 / qScale, 1024, stream, true); - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu deleted file mode 100644 index 7cd3e56418726a3b8e32b7835560771aee873cca..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.cu +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
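In the HFace `enqueue` above, the TensorRT tensor scales drive the kernel directly: `iDesc.scale` dequantizes the int8 GEMM output and `1.0 / oDesc.scale` requantizes the normalized result, while a float copy of the residual stream rides alongside in `residual` / `residual_out`. A scalar per-token sketch of what `skipLayerNormI8II8O` computes under those scales, assuming symmetric int8 quantization with round-to-nearest and a ±127 clamp (the exact rounding in the kernel's `float42char4` helper may differ):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Scalar reference for one token (one row of hidden_size elements); buffer roles
// mirror the kernel: `input` is the int8 GEMM output, `residual` the float residual
// stream, `residual_bias` the preceding layer's bias.
void skip_layernorm_i8_reference(const int8_t* input, const float* gamma, const float* beta,
                                 const float* residual_bias, int8_t* output,
                                 const float* residual, float* residual_out,
                                 int hidden_size, float dequant_scale, float quant_scale,
                                 float epsilon = 1e-5f) {
    // 1) dequantize and fuse the residual + bias add
    std::vector<float> vals(hidden_size);
    float mean = 0.0f;
    for (int i = 0; i < hidden_size; ++i) {
        float r = residual[i] + residual_bias[i];
        vals[i] = static_cast<float>(input[i]) * dequant_scale + r;
        mean += vals[i];
    }
    mean /= hidden_size;
    float m2 = 0.0f;
    for (int i = 0; i < hidden_size; ++i) m2 += (vals[i] - mean) * (vals[i] - mean);
    // 2) normalize, keep a float copy for the next residual hop, requantize to int8
    const float inv_std = 1.0f / std::sqrt(m2 / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i) {
        float norm = (vals[i] - mean) * inv_std * gamma[i] + beta[i];
        residual_out[i] = norm;
        int q = static_cast<int>(std::nearbyint(norm * quant_scale));
        output[i] = static_cast<int8_t>(std::max(-127, std::min(127, q)));
    }
}
```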
-*/ -#include "backend/bert/bert_helper.h" -#include "skipLayerNormInt8Plugin.h" -using namespace nvinfer1::ixrt_plugin::backend; - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -__global__ void skipLayernormI8II8OKernel(const int8_t *input, const float *scale, const float *bias, - const float *residual_bias, int8_t *output, float *residual, float* residual_out, - int hidden_size, float dequant_scale, float quant_scale, - bool is_post_ln) { - // register - // process 2 data - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 4; - char4 *p_input = (char4 *)input; - char4 *p_output = (char4 *)output; - float4 *p_residual = (float4 *)residual; - float4 *p_residual_out = (float4 *)residual_out; - float4 *p_scale = (float4 *)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_residual_bias = (float4 *)residual_bias; - // one line start - p_input += block_start; - p_output += block_start; - p_residual += block_start; - p_residual_out += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - - // load data from global memory -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - // vals = dequant(input) + residual + bias - p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x; - p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y; - p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z; - p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w; - vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale); - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - - // mean var - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon, - p_scale[element_index], p_bias[element_index]); - - p_residual_out[element_index].x = norm_value.x; - p_residual_out[element_index].y = norm_value.y; - p_residual_out[element_index].z = norm_value.z; - p_residual_out[element_index].w = norm_value.w; - - char4 res = float42char4(norm_value, quant_scale); - p_output[element_index] = res; - } -} - -template -__global__ void skipLayernormI8IF32OKernel(const int8_t *input, const float *scale, const float *bias, - const float *residual_bias, float *output, float *residual, float* residual_out, - int hidden_size, float dequant_scale, float quant_scale, - bool is_post_ln) { - // register - // process 2 data - float4 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 4; - char4 *p_input = (char4 *)input; - float4 *p_output = (float4 *)output; - float4 *p_residual = (float4 *)residual; - float4 *p_residual_out = (float4 *)residual_out; - float4 *p_scale = (float4 
*)scale; - float4 *p_bias = (float4 *)bias; - float4 *p_residual_bias = (float4 *)residual_bias; - // one line start - p_input += block_start; - p_output += block_start; - p_residual += block_start; - p_residual_out += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - - // load data from global memory -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - // vals = dequant(input) + residual + bias - p_residual_out[element_index].x = p_residual[element_index].x + p_residual_bias[element_index].x; - p_residual_out[element_index].y = p_residual[element_index].y + p_residual_bias[element_index].y; - p_residual_out[element_index].z = p_residual[element_index].z + p_residual_bias[element_index].z; - p_residual_out[element_index].w = p_residual[element_index].w + p_residual_bias[element_index].w; - vals[it] = char4addfloat4_dequant(p_input[element_index], p_residual_out[element_index], dequant_scale); - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].z, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].w, &thread_mean, &thread_m2, &thread_count); - } - - // mean var - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float4 norm_value = compute_float4_norm_value(vals[it], mean, m2, hidden_size, epsilon, - p_scale[element_index], p_bias[element_index]); - - p_output[element_index].x = norm_value.x; - p_output[element_index].y = norm_value.y; - p_output[element_index].z = norm_value.z; - p_output[element_index].w = norm_value.w; - } -} - - -void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln) { - - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - skipLayernormI8II8OKernel<1> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 2: - skipLayernormI8II8OKernel<2> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 3: - skipLayernormI8II8OKernel<3> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 4: - skipLayernormI8II8OKernel<4> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 5: - skipLayernormI8II8OKernel<5> - <<>>(input, scale, bias, residual_bias, output, 
residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8II8OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8II8OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8II8OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8II8OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8II8OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8II8OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8II8OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8II8OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8II8OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8II8OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8II8OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 1024) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % C10_WARP_SIZE != 0) { - throw std::runtime_error("hidden_size // C10_WARP_SIZE != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 4; - - switch (num_warp) { - case 1: - skipLayernormI8IF32OKernel<1> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 2: - skipLayernormI8IF32OKernel<2> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 3: - skipLayernormI8IF32OKernel<3> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 4: - skipLayernormI8IF32OKernel<4> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - 
case 5: - skipLayernormI8IF32OKernel<5> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 6: - skipLayernormI8IF32OKernel<6> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 7: - skipLayernormI8IF32OKernel<7> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 8: - skipLayernormI8IF32OKernel<8> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 9: - skipLayernormI8IF32OKernel<9> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 10: - skipLayernormI8IF32OKernel<10> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 11: - skipLayernormI8IF32OKernel<11> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 12: - skipLayernormI8IF32OKernel<12> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 13: - skipLayernormI8IF32OKernel<13> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 14: - skipLayernormI8IF32OKernel<14> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 15: - skipLayernormI8IF32OKernel<15> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - case 16: - skipLayernormI8IF32OKernel<16> - <<>>(input, scale, bias, residual_bias, output, residual, residual_out, hidden_size, - dequant_scale, quant_scale, is_post_ln); - break; - default: - throw std::runtime_error("skipLayernormI8II8OKernel"); - break; - } -} - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h deleted file mode 100644 index f752f59f5e590b00485568f07c43cd47ea4586a1..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormInt8Plugin.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
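Both launchers above dispatch on `num_warp = hidden_size / C10_WARP_SIZE / 4`: one warp handles one token, so each of its 32 lanes owns `THREAD_DATA_LEN` float4 chunks of the hidden vector (for BERT-large, hidden_size = 1024 selects the `<8>` instantiation), and the per-lane statistics are folded with Welford's online mean/variance before the warp-level reduce. A scalar sketch of that bookkeeping; the names are illustrative, the real helpers live in `backend/bert/bert_helper.h`:

```cpp
// Scalar model of the Welford accumulation used by the kernels above.
struct Welford {
    float mean = 0.0f, m2 = 0.0f, count = 0.0f;

    // Fold one sample into the running (mean, m2, count) triple
    // (what WelfordCombine does per lane).
    void combine(float x) {
        count += 1.0f;
        float delta = x - mean;
        mean += delta / count;
        m2 += delta * (x - mean);
    }

    // Merge another partial accumulator (what WelfordWarpReduce does across lanes).
    void merge(const Welford& o) {
        if (o.count == 0.0f) return;
        float total = count + o.count;
        float delta = o.mean - mean;
        mean += delta * (o.count / total);
        m2 += o.m2 + delta * delta * (count * o.count / total);
        count = total;
    }
    // After all samples: variance = m2 / count, which the kernels use as m2 / hidden_size.
};
```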
-*/ -#pragma once - -#include -#include -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - - -void skipLayerNormI8II8O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - int8_t *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln); - -void skipLayerNormI8IF32O(const int8_t *input, const float *scale, const float *bias, const float *residual_bias, - float *output, float *residual, float* residual_out, int batch_tokens, int hidden_size, float dequant_scale, - float quant_scale, int max_thread_per_block, cudaStream_t stream, - bool is_post_ln); - -class SkipLayerNormInt8PluginBase : public nvinfer1::IPluginV2DynamicExt -{ -public: - SkipLayerNormInt8PluginBase( - std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32); - - SkipLayerNormInt8PluginBase(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make SkipLayerNormInterleavedPlugin without - // arguments, so we delete default constructor. - SkipLayerNormInt8PluginBase() = delete; - - // IPluginV2 Methods - char const* getPluginType() const noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* pluginNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext Methods - nvinfer1::DataType getOutputDataType( - int32_t index, nvinfer1::DataType const* inputTypes, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt Methods - nvinfer1::DimsExprs getOutputDimensions(int32_t outputIndex, nvinfer1::DimsExprs const* inputs, int32_t nbInputs, - nvinfer1::IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination( - int32_t pos, nvinfer1::PluginTensorDesc const* inOut, int32_t nbInputs, int32_t nbOutputs) noexcept override; - void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, int32_t nbInputs, - nvinfer1::DynamicPluginTensorDesc const* out, int32_t nbOutputs) noexcept override; - size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int32_t nbInputs, - nvinfer1::PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept override; - -protected: - std::string const& mLayerName; - std::string mNamespace; - - bert::cuda_unique_ptr mGammaDev; - bert::cuda_unique_ptr mBetaDev; - size_t mLd{}; // leading dim - bert::WeightsWithOwnership mGamma; - bert::WeightsWithOwnership mBeta; - - size_t mParamWordsize{}; - bool mParamsOnDevice{}; - bool mHasBias{}; - cuda_unique_ptr mBiasDev; - WeightsWithOwnership mBias; - bool output_fp32{}; -}; - -class SkipLayerNormInt8PluginHFace : public SkipLayerNormInt8PluginBase -{ -public: - SkipLayerNormInt8PluginHFace( - std::string const& name, nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias, bool output_fp32); - - SkipLayerNormInt8PluginHFace(std::string const& name, void const* data, size_t length); - - // It doesn't make sense to make SkipLayerNormInterleavedPlugin without - // arguments, so we delete default constructor. 
- SkipLayerNormInt8PluginHFace() = delete; - - // IPluginV2DynamicExt Methods - nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; - int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) noexcept override; - - // IPluginV2 Methods - int32_t initialize() noexcept override; - void terminate() noexcept override; - void destroy() noexcept override; - int32_t getNbOutputs() const noexcept override; - char const* getPluginVersion() const noexcept override; -}; - -class SkipLayerNormInt8PluginBaseCreator : public nvinfer1::IPluginCreator -{ -public: - SkipLayerNormInt8PluginBaseCreator(); - - char const* getPluginName() const noexcept override; - - nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override; - - void setPluginNamespace(char const* pluginNamespace) noexcept override; - - char const* getPluginNamespace() const noexcept override; - -private: - static nvinfer1::PluginFieldCollection mFC; - static std::vector mPluginAttributes; - std::string mNamespace; -}; - -class SkipLayerNormInt8PluginHFaceCreator : public SkipLayerNormInt8PluginBaseCreator -{ -public: - SkipLayerNormInt8PluginHFaceCreator(); - - char const* getPluginVersion() const noexcept override; - - nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override; - nvinfer1::IPluginV2* deserializePlugin( - char const* name, void const* serialData, size_t serialLength) noexcept override; -}; - -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp deleted file mode 100644 index 4ca63061c499490f3bf679fedb0c44bf16e961eb..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
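For orientation, the blob produced by `SkipLayerNormInt8PluginBase::serialize()` (and consumed by the HFace creator's `deserializePlugin()`) is simply the three scalar fields followed by the fp32 beta, gamma and optional bias rows copied back from device memory. A small sketch mirroring the arithmetic in `getSerializationSize()`; byte counts assume a typical LP64 build:

```cpp
#include <cstddef>

// Serialized size of the SkipLayerNormInt8 plugin blob; ld is the hidden size
// (1024 for BERT-large), so with bias this comes to roughly 12 KB.
size_t skip_ln_int8_blob_size(size_t ld, bool has_bias) {
    const size_t param_wordsize = sizeof(float);  // param_type is DataType::kFLOAT
    size_t size = sizeof(size_t)   /* mLd        */
                + sizeof(bool)     /* mHasBias   */
                + sizeof(bool)     /* output_fp32 */;
    size += 2 * ld * param_wordsize;              // beta + gamma
    if (has_bias) size += ld * param_wordsize;    // optional bias
    return size;
}
```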
-*/ -#include "skipLayerNormPlugin.h" - -#include "bertCommon.h" -#include "checkMacrosPlugin.h" -#include "plugin.h" -#include "serialize.h" - -using namespace nvinfer1; -using namespace nvinfer1::ixrt_plugin; -using namespace nvinfer1::ixrt_plugin::bert; - -namespace { -char const* kSKIP_LAYER_NORM_VERSION{"1"}; -char const* kSKIP_LAYER_NORM_NAME{"CustomSkipLayerNormPluginDynamic_IxRT"}; -char const* kSKIP_LAYER_NORM_VAR_SEQLEN_VERSION{"2"}; -} // namespace - -// Static class fields initialization -PluginFieldCollection SkipLayerNormPluginDynamicCreator::mFC{}; -std::vector SkipLayerNormPluginDynamicCreator::mPluginAttributes; - -// REGISTER_TENSORRT_PLUGIN(SkipLayerNormPluginDynamicCreator); - -static inline DataType getParamWordType(DataType cfgType) noexcept { - if (cfgType == DataType::kINT8) { - return DataType::kHALF; - } - - return cfgType; -} - -SkipLayerNormPluginDynamicCreator::SkipLayerNormPluginDynamicCreator() { - mPluginAttributes.clear(); - mPluginAttributes.emplace_back(PluginField("ld")); - mPluginAttributes.emplace_back(PluginField("type_id")); - mPluginAttributes.emplace_back(PluginField("beta")); - mPluginAttributes.emplace_back(PluginField("gamma")); - mPluginAttributes.emplace_back(PluginField("bias")); - mFC.nbFields = mPluginAttributes.size(); - mFC.fields = mPluginAttributes.data(); -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginName() const noexcept { return kSKIP_LAYER_NORM_NAME; } - -char const* SkipLayerNormPluginDynamicCreator::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; } - -PluginFieldCollection const* SkipLayerNormPluginDynamicCreator::getFieldNames() noexcept { return &mFC; } - -IPluginV2* SkipLayerNormPluginDynamicCreator::createPlugin(char const* name, PluginFieldCollection const* fc) noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamicCreator createPlugin" << endl; - - int32_t ld = 0; - Weights beta{DataType::kFLOAT, nullptr, 0}; - Weights gamma{DataType::kFLOAT, nullptr, 0}; - Weights bias{DataType::kFLOAT, nullptr, 0}; - int32_t typeId = -1; - - IXRT_PLUGIN_ASSERT(fc != nullptr); - - ixrt_plugin::validateRequiredAttributesExist({"type_id", "beta", "ld", "gamma"}, fc); - - for (int32_t i = 0; i < fc->nbFields; i++) { - std::string field_name(fc->fields[i].name); - if (field_name.compare("ld") == 0) { - ld = *static_cast(fc->fields[i].data); - gLogInfo << "Building ld: " << ld << endl; - } - - if (field_name.compare("type_id") == 0) { - typeId = *static_cast(fc->fields[i].data); - gLogInfo << "Building typeId: " << typeId << endl; - } - - if (field_name.compare("beta") == 0) { - gLogInfo << "Building beta..." << endl; - beta.values = fc->fields[i].data; - beta.count = fc->fields[i].length; - beta.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("gamma") == 0) { - gLogInfo << "Building gamma..." << endl; - gamma.values = fc->fields[i].data; - gamma.count = fc->fields[i].length; - gamma.type = fieldTypeToDataType(fc->fields[i].type); - } - - if (field_name.compare("bias") == 0) { - gLogInfo << "Building bias..." 
<< endl; - bias.values = fc->fields[i].data; - bias.count = fc->fields[i].length; - bias.type = fieldTypeToDataType(fc->fields[i].type); - } - } - gLogInfo << "Type " << typeId << endl; - - IXRT_PLUGIN_CHECK_VALUE(typeId >= 0 && typeId <= 3, - ("SkipLayerNorm: Invalid type ID: " + std::to_string(typeId)).c_str()); - - IXRT_PLUGIN_CHECK_VALUE(beta.values != nullptr, "SkipLayerNorm: invalid beta"); - IXRT_PLUGIN_CHECK_VALUE(beta.count > 0, "SkipLayerNorm: invalid beta"); - - IXRT_PLUGIN_CHECK_VALUE(gamma.values != nullptr, "SkipLayerNorm: invalid gamma"); - IXRT_PLUGIN_CHECK_VALUE(gamma.count > 0, "SkipLayerNorm: invalid gamma"); - - IXRT_PLUGIN_CHECK_VALUE(typeId == (int)DataType::kHALF, "typeId != DataType::kHALF error"); - - return new SkipLayerNormPluginDynamic(name, static_cast(typeId), ld, beta, gamma, bias); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -nvinfer1::IPluginV2* SkipLayerNormPluginDynamicCreator::deserializePlugin(char const* name, void const* serialData, - size_t serialLength) noexcept { - try { - return new SkipLayerNormPluginDynamic(name, serialData, serialLength); - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -void SkipLayerNormPluginDynamicCreator::setPluginNamespace(char const* pluginNamespace) noexcept { - try { - mNamespace = pluginNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* SkipLayerNormPluginDynamicCreator::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -//#########################################################################// -SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string name, const DataType type, int32_t const ld, - Weights const& beta, Weights const& gamma, Weights const& bias) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mHiddenSize(ld), mType(type), mBiasDev(nullptr) { - IXRT_PLUGIN_ASSERT(mType == nvinfer1::DataType::kFLOAT || mType == nvinfer1::DataType::kHALF || - mType == nvinfer1::DataType::kINT8); - - mCfgType = mType == DataType::kINT8 ? 
DataType::kHALF : mType; - mParamWordsize = getElementSize(mCfgType); - - mBeta.convertAndCopy(beta, mCfgType); - mGamma.convertAndCopy(gamma, mCfgType); - - mHasBias = (bias.values != nullptr); - if (mHasBias) { - mBias.convertAndCopy(bias, mCfgType); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev); - } -} - -SkipLayerNormPluginDynamic::SkipLayerNormPluginDynamic(const std::string& name, void const* data, size_t length) - : mLayerName(name), mGammaDev(nullptr), mBetaDev(nullptr), mBiasDev(nullptr) { - gLogInfo << "SkipLayerNormPluginDynamic deserialize" << endl; - - // Deserialize in the same order as serialization - deserialize_value(&data, &length, &mType); - deserialize_value(&data, &length, &mCfgType); - deserialize_value(&data, &length, &mHiddenSize); - deserialize_value(&data, &length, &mHasBias); - - IXRT_PLUGIN_ASSERT(mCfgType == nvinfer1::DataType::kFLOAT || mCfgType == nvinfer1::DataType::kHALF); - mParamWordsize = getElementSize(mCfgType); - - char const* d = static_cast(data); - mBeta.convertAndCopy(d, mHiddenSize, mCfgType); - mGamma.convertAndCopy(d, mHiddenSize, mCfgType); - if (mHasBias) { - mBias.convertAndCopy(d, mHiddenSize, mCfgType); - } - - copyToDevice(mGamma, getWeightsSize(mGamma, mCfgType), mGammaDev); - copyToDevice(mBeta, getWeightsSize(mBeta, mCfgType), mBetaDev); - if (mHasBias) { - copyToDevice(mBias, getWeightsSize(mBias, mCfgType), mBiasDev); - } -} - -// IPluginV2Ext Methods -DataType SkipLayerNormPluginDynamic::getOutputDataType(int32_t index, DataType const* inputTypes, - int32_t nbInputs) const noexcept { - try { - IXRT_PLUGIN_ASSERT(inputTypes != nullptr); - IXRT_PLUGIN_ASSERT(index == 0); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - return inputTypes[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DataType{}; -} - -// IPluginV2 Methods -char const* SkipLayerNormPluginDynamic::getPluginType() const noexcept { return kSKIP_LAYER_NORM_NAME; } - -char const* SkipLayerNormPluginDynamic::getPluginVersion() const noexcept { return kSKIP_LAYER_NORM_VERSION; } - -int32_t SkipLayerNormPluginDynamic::getNbOutputs() const noexcept { return 1; } -int32_t SkipLayerNormPluginDynamic::initialize() noexcept { - gLogInfo << "SkipLayerNormPluginDynamic initialize" << endl; - return 0; -} - -void SkipLayerNormPluginDynamic::terminate() noexcept { gLogInfo << "SkipLayerNormPluginDynamic terminate" << endl; } - -size_t SkipLayerNormPluginDynamic::getSerializationSize() const noexcept { - const size_t biasSize = mHasBias ? 
(mHiddenSize * mParamWordsize) : 0; - return 2 * mParamWordsize * mHiddenSize + 2 * sizeof(DataType) + sizeof(mHiddenSize) + biasSize + sizeof(mHasBias); -} - -void SkipLayerNormPluginDynamic::serialize(void* buffer) const noexcept { - try { - serialize_value(&buffer, mType); - serialize_value(&buffer, mCfgType); - serialize_value(&buffer, mHiddenSize); - serialize_value(&buffer, mHasBias); - - char* d = static_cast(buffer); - serFromDev(d, static_cast(mBetaDev.get()), mHiddenSize * mParamWordsize); - serFromDev(d, static_cast(mGammaDev.get()), mHiddenSize * mParamWordsize); - if (mHasBias) { - serFromDev(d, static_cast(mBiasDev.get()), mHiddenSize * mParamWordsize); - } - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormPluginDynamic::destroy() noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic destroy" << endl; - // This gets called when the network containing plugin is destroyed - mGammaDev.reset(nullptr); - mBetaDev.reset(nullptr); - if (mHasBias) { - mBiasDev.reset(nullptr); - } - delete this; - } catch (std::exception const& e) { - caughtError(e); - } -} - -void SkipLayerNormPluginDynamic::setPluginNamespace(char const* libNamespace) noexcept { - try { - mNamespace = libNamespace; - } catch (std::exception const& e) { - caughtError(e); - } -} - -char const* SkipLayerNormPluginDynamic::getPluginNamespace() const noexcept { return mNamespace.c_str(); } - -// IPluginV2DynamicExt Methods -IPluginV2DynamicExt* SkipLayerNormPluginDynamic::clone() const noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic clone" << endl; - - auto* p = new SkipLayerNormPluginDynamic(mLayerName, mType, mHiddenSize, mBeta, mGamma, mBias); - p->initialize(); - p->setPluginNamespace(mNamespace.c_str()); - return p; - } catch (std::exception const& e) { - caughtError(e); - } - return nullptr; -} - -DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, - int32_t nbInputs, IExprBuilder& exprBuilder) noexcept { - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(outputIndex == 0); - IXRT_PLUGIN_ASSERT(inputs[0].nbDims == inputs[1].nbDims); - return inputs[0]; - } catch (std::exception const& e) { - caughtError(e); - } - return DimsExprs{}; -} - -bool SkipLayerNormPluginDynamic::supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept { - try { - IXRT_PLUGIN_ASSERT(inOut != nullptr); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(pos >= 0 && pos < (nbInputs + nbOutputs)); - - PluginTensorDesc const& in = inOut[pos]; - if (pos == 0) { - return (in.type == mType) && (in.format == TensorFormat::kLINEAR); - } - PluginTensorDesc const& prev = inOut[pos - 1]; - - return in.type == prev.type && in.format == prev.format && (in.type == DataType::kHALF); - } catch (std::exception const& e) { - caughtError(e); - } - return false; -} - -void SkipLayerNormPluginDynamic::configurePlugin(DynamicPluginTensorDesc const* inputs, int32_t nbInputs, - DynamicPluginTensorDesc const* outputs, int32_t nbOutputs) noexcept { - try { - gLogInfo << "SkipLayerNormPluginDynamic configurePlugin" << endl; - - // Validate input arguments - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - IXRT_PLUGIN_ASSERT(nbOutputs == 1); - IXRT_PLUGIN_ASSERT(nbInputs == 2); - if (mType == DataType::kFLOAT || mType == DataType::kHALF) { - IXRT_PLUGIN_ASSERT(mType == 
inputs[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type); - } else { - IXRT_PLUGIN_ASSERT(mType == inputs[0].desc.type || DataType::kFLOAT == inputs[0].desc.type); - IXRT_PLUGIN_ASSERT(mType == inputs[1].desc.type || DataType::kFLOAT == inputs[1].desc.type); - } - auto const& inDims0 = inputs[0].desc.dims; - auto const& inDims1 = inputs[1].desc.dims; - IXRT_PLUGIN_ASSERT(inDims0.nbDims == inDims1.nbDims); - - IXRT_PLUGIN_ASSERT(std::equal(inDims0.d, inDims0.d + inDims0.nbDims, inDims1.d)); - - IXRT_PLUGIN_ASSERT(inDims0.nbDims == 5); - mHiddenSize = inDims0.d[HDIM]; // hiddensize - IXRT_PLUGIN_ASSERT(mHiddenSize != 0U); - IXRT_PLUGIN_ASSERT(inDims0.d[3] == 1); - IXRT_PLUGIN_ASSERT(inDims0.d[4] == 1); - IXRT_PLUGIN_ASSERT(outputs[0].desc.type == DataType::kHALF); - - mCfgType = inputs[0].desc.type == DataType::kINT8 ? DataType::kHALF : inputs[0].desc.type; - - auto const paramType = getParamWordType(mCfgType); - mParamWordsize = getElementSize(paramType); - } catch (std::exception const& e) { - caughtError(e); - } -} - -size_t SkipLayerNormPluginDynamic::getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, - PluginTensorDesc const* outputs, int32_t nbOutputs) const noexcept { - return 0; -} - -int32_t SkipLayerNormPluginDynamic::enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, - void const* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) noexcept { - gLogInfo << "in SkipLayerNormPluginDynamic.." << endl; - int32_t status = -1; - try { - IXRT_PLUGIN_ASSERT(inputs != nullptr); - IXRT_PLUGIN_ASSERT(outputs != nullptr); - int32_t const inputVolume = volume(inputDesc[0].dims); - DataType iType = inputDesc->type; - - // Our plugin outputs only one tensor - // Launch CUDA kernel wrapper and save its return value - if (iType == DataType::kFLOAT) { - gLogInfo << "SkipLayerNormPlugin fp32 not supported yet!" << endl; - return STATUS_NOT_SUPPORTED; - } else if (iType == DataType::kHALF) { - auto const* input = static_cast(inputs[0]); - auto skip = (half*)(inputs[1]); - auto* output = static_cast(outputs[0]); - auto const* const bias = static_cast(mBiasDev.get()); - auto const* const beta = static_cast(mBetaDev.get()); - auto const* const gamma = static_cast(mGammaDev.get()); - - if (mHasBias) { - status = computeSkipLayerNorm(stream, static_cast(mHiddenSize), inputVolume, input, - gamma, beta, bias, skip, output); - } else { - status = computeSkipLayerNorm(stream, static_cast(mHiddenSize), inputVolume, - input, gamma, beta, bias, skip, output); - } - } else { - IXRT_PLUGIN_CHECK_VALUE(false, "Unsupported type error, expected [kHALF,kFLOAT], but received " + - std::to_string(static_cast(iType))); - } - if (status != cudaSuccess) { - return STATUS_FAILURE; - } - return STATUS_SUCCESS; - } catch (std::exception const& e) { - caughtError(e); - } - return STATUS_FAILURE; -} diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu deleted file mode 100644 index 1b127fc5bbd62c131fea5f0eceddc4dc5b464d47..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.cu +++ /dev/null @@ -1,401 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
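The deleted `skipLayerNormPlugin.cu` below carries the fp16 version of the same fusion: add the residual (and an optional bias) to the attention/FFN output, LayerNorm the sum, and update the residual stream with either the normalized value (post-LN) or the pre-norm sum (pre-LN). A float scalar reference of that per-token math, with the half/float conversions and vectorized half2 loads of the kernels left out:

```cpp
#include <cmath>
#include <vector>

// Scalar per-token reference for the fused residual-add + LayerNorm. `residual`
// is updated in place, mirroring the kernels: normalized value in post-LN mode,
// pre-norm sum in pre-LN mode.
void skip_layernorm_reference(const float* input, const float* gamma, const float* beta,
                              const float* residual_bias, float* output, float* residual,
                              int hidden_size, bool is_post_ln, float epsilon = 1e-5f) {
    std::vector<float> sum(hidden_size);
    float mean = 0.0f;
    for (int i = 0; i < hidden_size; ++i) {
        sum[i] = input[i] + residual[i] + (residual_bias ? residual_bias[i] : 0.0f);
        mean += sum[i];
    }
    mean /= hidden_size;
    float m2 = 0.0f;
    for (int i = 0; i < hidden_size; ++i) m2 += (sum[i] - mean) * (sum[i] - mean);
    const float inv_std = 1.0f / std::sqrt(m2 / hidden_size + epsilon);
    for (int i = 0; i < hidden_size; ++i) {
        float norm = (sum[i] - mean) * inv_std * gamma[i] + beta[i];
        output[i] = norm;
        residual[i] = is_post_ln ? norm : sum[i];
    }
}
```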
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#include -#include -#include - -#include "backend/bert/bert_helper.h" -#include "skipLayerNormPlugin.h" -// #include "backend/transformer/transformer_add_norm.h" - -using namespace nvinfer1::ixrt_plugin::backend; - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -__global__ void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, - const half *residual_bias, half *output, half *residual, int hidden_size, - bool is_post_ln) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 2; - half2 *p_input = (half2 *)input; - half2 *p_output = (half2 *)output; - half2 *p_residual = (half2 *)residual; - half2 *p_scale = (half2 *)scale; - half2 *p_bias = (half2 *)bias; - half2 *p_residual_bias = (half2 *)residual_bias; - // one line start - p_input += block_start; - p_output += block_start; - p_residual += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - if (element_index < hidden_size / 2) { - half2 value1 = p_input[element_index]; - half2 value2 = p_residual[element_index]; - - vals[it].x = __half2float(value1.x) + __half2float(value2.x); - vals[it].y = __half2float(value1.y) + __half2float(value2.y); - - half2 res_bias_val_1; - if (residual_bias == nullptr) { - res_bias_val_1.x = __float2half(0.0f); - res_bias_val_1.y = __float2half(0.0f); - } else { - res_bias_val_1 = p_residual_bias[element_index]; - } - vals[it].x = vals[it].x + __half2float(res_bias_val_1.x); - vals[it].y = vals[it].y + __half2float(res_bias_val_1.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - if (element_index < hidden_size / 2) { - float2 norm_value; - half2 scale_1 = p_scale[element_index]; - half2 bias_1 = p_bias[element_index]; - norm_value.x = (vals[it].x - 
mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + - __half2float(bias_1.x); - norm_value.y = (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + - __half2float(bias_1.y); - - half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - p_output[element_index] = res; - - half2 r1; - if (is_post_ln) { - r1 = res; - } else { - r1.x = __float2half(vals[it].x); - r1.y = __float2half(vals[it].y); - } - p_residual[element_index] = r1; - } - } -} - -void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if (hidden_size % 2 != 0) { - throw std::runtime_error("hidden_size % 2 != 0"); - } - - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int neareast_hidden_size = hidden_size; - if (neareast_hidden_size % (C10_WARP_SIZE * 2) != 0) { - neareast_hidden_size = neareast_hidden_size + C10_WARP_SIZE * 2 - neareast_hidden_size % (C10_WARP_SIZE * 2); - } - - int num_warp = neareast_hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferResidualBiasLnPad<1><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 2: - IxinferResidualBiasLnPad<2><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 3: - IxinferResidualBiasLnPad<3><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 4: - IxinferResidualBiasLnPad<4><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 5: - IxinferResidualBiasLnPad<5><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 6: - IxinferResidualBiasLnPad<6><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 7: - IxinferResidualBiasLnPad<7><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 8: - IxinferResidualBiasLnPad<8><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 9: - IxinferResidualBiasLnPad<9><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 10: - IxinferResidualBiasLnPad<10><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 11: - IxinferResidualBiasLnPad<11><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 12: - IxinferResidualBiasLnPad<12><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 13: - IxinferResidualBiasLnPad<13><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 14: - IxinferResidualBiasLnPad<14><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 15: - IxinferResidualBiasLnPad<15><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 16: - IxinferResidualBiasLnPad<16><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - default: - std::cout << "hidden size: " << hidden_size << std::endl; - throw 
std::runtime_error("IxinferResidualBiasLnPad not supported!"); - break; - } -} - -template -__global__ void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int hidden_size, bool is_post_ln) { - float2 vals[THREAD_DATA_LEN]; - int block_start = blockIdx.x * hidden_size / 2; - half2 *p_input = (half2 *)input; - half2 *p_output = (half2 *)output; - half2 *p_residual = (half2 *)residual; - half2 *p_scale = (half2 *)scale; - half2 *p_bias = (half2 *)bias; - half2 *p_residual_bias = (half2 *)residual_bias; - - p_input += block_start; - p_output += block_start; - p_residual += block_start; - - float thread_m2 = 0; - float thread_mean = 0; - float thread_count = 0; - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - half2 value1 = p_input[element_index]; - half2 value2 = p_residual[element_index]; - - vals[it].x = __half2float(value1.x) + __half2float(value2.x); - vals[it].y = __half2float(value1.y) + __half2float(value2.y); - - half2 res_bias_val_1; - if (residual_bias == nullptr) { - res_bias_val_1.x = __float2half(0.0f); - res_bias_val_1.y = __float2half(0.0f); - } else { - res_bias_val_1 = p_residual_bias[element_index]; - } - vals[it].x = vals[it].x + __half2float(res_bias_val_1.x); - vals[it].y = vals[it].y + __half2float(res_bias_val_1.y); - - WelfordCombine(vals[it].x, &thread_mean, &thread_m2, &thread_count); - WelfordCombine(vals[it].y, &thread_mean, &thread_m2, &thread_count); - } - - float mean = 0; - float m2 = 0; - float count = 0; - WelfordWarpReduce(thread_mean, thread_m2, thread_count, &mean, &m2, &count); - mean = __shfl_sync(0xffffffff, mean, 0, C10_WARP_SIZE); - m2 = __shfl_sync(0xffffffff, m2, 0, C10_WARP_SIZE); - count = __shfl_sync(0xffffffff, count, 0, C10_WARP_SIZE); - -#pragma unroll - for (int it = 0; it < THREAD_DATA_LEN; ++it) { - int element_index = threadIdx.x + it * C10_WARP_SIZE; - float2 norm_value; - half2 scale_1 = p_scale[element_index]; - half2 bias_1 = p_bias[element_index]; - norm_value.x = - (vals[it].x - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.x) + __half2float(bias_1.x); - norm_value.y = - (vals[it].y - mean) * rsqrtf(m2 / hidden_size + epsilon) * __half2float(scale_1.y) + __half2float(bias_1.y); - - half2 res; - res.x = __float2half(norm_value.x); - res.y = __float2half(norm_value.y); - - p_output[element_index] = res; - - half2 r1; - if (is_post_ln) { - r1 = res; - } else { - r1.x = __float2half(vals[it].x); - r1.y = __float2half(vals[it].y); - } - p_residual[element_index] = r1; - } -} - -void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln) { - if (hidden_size > 2048) { - throw std::runtime_error("hidden_size should <= 1024"); - } - if ((hidden_size % 2 == 0) && (hidden_size % (C10_WARP_SIZE * 2) != 0)) { - IxinferResidualBiasLnPad(input, scale, bias, residual_bias, output, residual, batch_tokens, hidden_size, stream, - is_post_ln); - } else { - if (hidden_size % (C10_WARP_SIZE * 2) != 0) { - throw std::runtime_error("hidden_size // (C10_WARP_SIZE*2) != 0"); - } - dim3 gridSize(batch_tokens); - dim3 blockSize(C10_WARP_SIZE); - - int num_warp = hidden_size / C10_WARP_SIZE / 2; - - switch (num_warp) { - case 1: - IxinferResidualBiasLn<1><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, 
is_post_ln); - break; - case 2: - IxinferResidualBiasLn<2><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 3: - IxinferResidualBiasLn<3><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 4: - IxinferResidualBiasLn<4><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 5: - IxinferResidualBiasLn<5><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 6: - IxinferResidualBiasLn<6><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 7: - IxinferResidualBiasLn<7><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 8: - IxinferResidualBiasLn<8><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 9: - IxinferResidualBiasLn<9><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 10: - IxinferResidualBiasLn<10><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 11: - IxinferResidualBiasLn<11><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 12: - IxinferResidualBiasLn<12><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 13: - IxinferResidualBiasLn<13><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 14: - IxinferResidualBiasLn<14><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 15: - IxinferResidualBiasLn<15><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - case 16: - IxinferResidualBiasLn<16><<>>(input, scale, bias, residual_bias, output, - residual, hidden_size, is_post_ln); - break; - default: - throw std::runtime_error("IxinferResidualBiasLn"); - break; - } - } -} - -template -int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output) -{ - assert(volume % E == 0); - int32_t batch_tokens = volume / E; - IxinferResidualBiasLn(input, gamma, beta, bias, output, skip, batch_tokens, E, stream, true); - return 0; -} - -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -template int32_t computeSkipLayerNorm(cudaStream_t, int32_t, int32_t, const half*, const half*, const half*, const half*, half*, half*); -} // namespace bert -} // namespace nvinfer1::ixrt_plugin \ No newline at end of file diff --git a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h b/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h deleted file mode 100644 index fa37318fcff8e3d8ab3f3cfcadf34e378477a1a3..0000000000000000000000000000000000000000 --- a/models/nlp/plm/bert_large_squad/ixrt/src/skip_layernorm/skipLayerNormPlugin.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -* All Rights Reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); you may -* not use this file except in compliance with the License. 
You may obtain -* a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -* -* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -* SPDX-License-Identifier: Apache-2.0 -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ -#pragma once -#include -#include - -#include "NvInferRuntime.h" -#include "bertCommon.h" - -namespace nvinfer1::ixrt_plugin { -namespace bert { - -template -int32_t computeSkipLayerNorm(cudaStream_t stream, int32_t E, int32_t volume, const T* input, const T* gamma, const T* beta, const T* bias, T* skip, T* output); - -void IxinferResidualBiasLn(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln); - -void IxinferResidualBiasLnPad(const half *input, const half *scale, const half *bias, const half *residual_bias, - half *output, half *residual, int batch_tokens, int hidden_size, cudaStream_t stream, - bool is_post_ln); -class SkipLayerNormPluginDynamic : public IPluginV2DynamicExt { - public: - SkipLayerNormPluginDynamic(const std::string name, const nvinfer1::DataType type, int32_t const ld, - nvinfer1::Weights const& beta, nvinfer1::Weights const& gamma, nvinfer1::Weights const& bias); - SkipLayerNormPluginDynamic(const std::string &name, void const* data, size_t length); - SkipLayerNormPluginDynamic() noexcept = delete; - ~SkipLayerNormPluginDynamic() override = default; - - // IPluginV2 methods - char const* getPluginType() const noexcept override; - char const* getPluginVersion() const noexcept override; - int32_t getNbOutputs() const noexcept override; - int32_t initialize() noexcept override; - void terminate() noexcept override; - size_t getSerializationSize() const noexcept override; - void serialize(void* buffer) const noexcept override; - void destroy() noexcept override; - void setPluginNamespace(char const* libNamespace) noexcept override; - char const* getPluginNamespace() const noexcept override; - - // IPluginV2Ext methods - DataType getOutputDataType(int32_t index, DataType const* inputType, int32_t nbInputs) const noexcept override; - - // IPluginV2DynamicExt methods - IPluginV2DynamicExt* clone() const noexcept override; - DimsExprs getOutputDimensions(int32_t outputIndex, DimsExprs const* inputs, int32_t nbInputs, - IExprBuilder& exprBuilder) noexcept override; - bool supportsFormatCombination(int32_t pos, PluginTensorDesc const* inOut, int32_t nbInputs, - int32_t nbOutputs) noexcept override; - void configurePlugin(DynamicPluginTensorDesc const* in, int32_t nbInputs, DynamicPluginTensorDesc const* out, - int32_t nbOutputs) noexcept 
override;
-    size_t getWorkspaceSize(PluginTensorDesc const* inputs, int32_t nbInputs, PluginTensorDesc const* outputs,
-                            int32_t nbOutputs) const noexcept override;
-    int32_t enqueue(PluginTensorDesc const* inputDesc, PluginTensorDesc const* outputDesc, void const* const* inputs,
-                    void* const* outputs, void* workspace, cudaStream_t stream) noexcept override;
-
-   private:
-    const std::string mLayerName;
-    std::string mNamespace;
-    cuda_unique_ptr mGammaDev;
-    cuda_unique_ptr mBetaDev;
-    WeightsWithOwnership mGamma;
-    WeightsWithOwnership mBeta;
-    size_t mHiddenSize{};
-    size_t mParamWordsize{};
-    DataType mType;
-    DataType mCfgType;
-    // mCfgType is the dataType for the beta, gamma and bias weights, always fp16 or fp32
-    // mType is the plugin IO datatype, can be int8
-
-    bool mHasBias{};
-    cuda_unique_ptr mBiasDev;
-    WeightsWithOwnership mBias;
-};
-
-class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator
-{
-public:
-    SkipLayerNormPluginDynamicCreator();
-
-    char const* getPluginName() const noexcept override;
-
-    char const* getPluginVersion() const noexcept override;
-
-    nvinfer1::PluginFieldCollection const* getFieldNames() noexcept override;
-
-    nvinfer1::IPluginV2* createPlugin(char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept override;
-
-    nvinfer1::IPluginV2* deserializePlugin(
-        char const* name, void const* serialData, size_t serialLength) noexcept override;
-
-    void setPluginNamespace(char const* pluginNamespace) noexcept override;
-
-    char const* getPluginNamespace() const noexcept override;
-
-private:
-    static nvinfer1::PluginFieldCollection mFC;
-    static std::vector<nvinfer1::PluginField> mPluginAttributes;
-    std::string mNamespace;
-};
-
-} // namespace bert
-} // namespace nvinfer1::ixrt_plugin
\ No newline at end of file
diff --git a/tests/run_ixrt.py b/tests/run_ixrt.py
index eb25acab7388ad14c509fd48a0862ff0bbec7f32..f2395bb83fcdee8e28aebf7683e05a2199f8b75e 100644
--- a/tests/run_ixrt.py
+++ b/tests/run_ixrt.py
@@ -481,16 +481,14 @@ def run_nlp_testcase(model):
     elif model_name == "bert_large_squad":
         script = f"""
         set -x
-        cd ../{model['model_path']}/python
-        bash script/build_engine.sh --bs 32
-        bash script/inference_squad.sh --bs 32
+        bash script/infer_bert_large_squad_fp16_accuracy.sh
+        bash script/infer_bert_large_squad_fp16_performance.sh
         """
         if prec == "int8":
             script = f"""
             set -x
-            cd ../{model['model_path']}/python
-            bash script/build_engine.sh --bs 32 --int8
-            bash script/inference_squad.sh --bs 32 --int8
+            bash script/infer_bert_large_squad_int8_accuracy.sh
+            bash script/infer_bert_large_squad_int8_performance.sh
             """

     r, t = run_script(script)
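For readers skimming the removed plugin sources above: the deleted skipLayerNorm kernels fuse the residual add, the optional bias add, and LayerNorm (Welford mean/variance per token, then scale by gamma and shift by beta) into a single pass over half-precision data. Below is a minimal NumPy sketch of the same math for reference only; the epsilon value, array shapes, and function name are illustrative assumptions, not taken from the deleted code.

```python
import numpy as np

def skip_layer_norm(x, skip, gamma, beta, bias=None, eps=1e-12):
    """Reference semantics of the fused kernel: y = LayerNorm(x + skip [+ bias]) * gamma + beta.

    x, skip: [tokens, hidden]; gamma, beta, bias: [hidden].
    The CUDA kernel also writes either the normalized output (post-LN) or the
    raw sum (pre-LN) back into the residual buffer; this sketch returns both.
    """
    s = x + skip + (bias if bias is not None else 0.0)   # residual + optional bias
    mean = s.mean(axis=-1, keepdims=True)                # per-token mean
    var = s.var(axis=-1, keepdims=True)                  # per-token population variance (m2 / hidden)
    y = (s - mean) / np.sqrt(var + eps) * gamma + beta   # normalize, scale, shift
    return y, s
```

The deleted `computeSkipLayerNorm` wrapper calls the kernel with `is_post_ln = true`, i.e. the normalized output is what gets written back into the residual buffer.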