diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/build.py b/models/nlp/large_language_model/llama2-13b/trtllm/build.py deleted file mode 100644 index 4ff0c9eaa0cedfd382783a5cfcca9175bf38acad..0000000000000000000000000000000000000000 --- a/models/nlp/large_language_model/llama2-13b/trtllm/build.py +++ /dev/null @@ -1,1163 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import json -import math -import os -import sys -import time -from pathlib import Path - -# isort: off -import torch -import torch.multiprocessing as mp -import tensorrt as trt -# isort: on -from transformers import LlamaConfig, LlamaForCausalLM - -try: - from transformers import MixtralForCausalLM -except ImportError: - MixtralForCausalLM = None - -try: - from transformers import LlavaConfig, LlavaForConditionalGeneration -except ImportError: - pass - -import tensorrt_llm -from tensorrt_llm import profiler -from tensorrt_llm._common import check_max_num_tokens -from tensorrt_llm._utils import str_dtype_to_trt -from tensorrt_llm.builder import Builder -from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.layers.attention import PositionEmbeddingType -from tensorrt_llm.logger import logger -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import quantize_model -from tensorrt_llm.network import net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime.lora_manager import LoraConfig - -from tensorrt_llm.models.llama.weight import ( # isort:skip - get_scaling_factors, load_from_awq_llama, load_from_binary, - load_from_gptq_llama, load_from_hf_checkpoint, load_from_hf_llama, - load_from_meta_llama, parse_bin_config) - -MODEL_NAME = "llama" - -# 2 routines: get_engine_name, serialize_engine -# are direct copy from gpt example, TODO: put in utils? 
- -import onnx -from onnx import TensorProto, helper - - -def trt_dtype_to_onnx(dtype): - if dtype == trt.float16: - return TensorProto.DataType.FLOAT16 - if dtype == trt.bfloat16: - return TensorProto.DataType.BFLOAT16 - elif dtype == trt.float32: - return TensorProto.DataType.FLOAT - elif dtype == trt.int32: - return TensorProto.DataType.INT32 - elif dtype == trt.int64: - return TensorProto.DataType.INT64 - elif dtype == trt.bool: - return TensorProto.DataType.BOOL - else: - raise TypeError("%s is not supported" % dtype) - - -def to_onnx(network, path): - inputs = [] - for i in range(network.num_inputs): - network_input = network.get_input(i) - inputs.append( - helper.make_tensor_value_info( - network_input.name, trt_dtype_to_onnx(network_input.dtype), - list(network_input.shape))) - - outputs = [] - for i in range(network.num_outputs): - network_output = network.get_output(i) - outputs.append( - helper.make_tensor_value_info( - network_output.name, trt_dtype_to_onnx(network_output.dtype), - list(network_output.shape))) - - nodes = [] - for i in range(network.num_layers): - layer = network.get_layer(i) - layer_inputs = [] - for j in range(layer.num_inputs): - ipt = layer.get_input(j) - if ipt is not None: - layer_inputs.append(layer.get_input(j).name) - layer_outputs = [ - layer.get_output(j).name for j in range(layer.num_outputs) - ] - nodes.append( - helper.make_node(str(layer.type), - name=layer.name, - inputs=layer_inputs, - outputs=layer_outputs, - domain="com.nvidia")) - - onnx_model = helper.make_model(helper.make_graph(nodes, - 'attention', - inputs, - outputs, - initializer=None), - producer_name='NVIDIA') - onnx.save(onnx_model, path) - - -def get_engine_name(model, dtype, tp_size, pp_size, rank): - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, - pp_size, rank) - - -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - with open(path, 'wb') as f: - f.write(engine) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def parse_arguments(cmd_args=None): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', type=int, default=1) - parser.add_argument('--tp_size', type=int, default=1) - parser.add_argument('--pp_size', type=int, default=1) - parser.add_argument('--model_dir', type=str, default=None) - parser.add_argument('--bin_model_dir', type=str, default=None) - parser.add_argument('--meta_ckpt_dir', type=str, default=None) - parser.add_argument('--quant_ckpt_path', type=str, default=None) - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float32', 'bfloat16', 'float16']) - parser.add_argument( - '--timing_cache', - type=str, - default='model.cache', - help= - 'The path of to read timing cache from, will be ignored if the file does not exist' - ) - parser.add_argument( - '--profiling_verbosity', - type=str, - default='layer_names_only', - choices=['layer_names_only', 'detailed', 'none'], - help= - 'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.' 
- ) - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument('--vocab_size', type=int, default=32000) - parser.add_argument('--n_layer', type=int, default=32) - parser.add_argument('--n_positions', type=int, default=2048) - parser.add_argument('--n_embd', type=int, default=4096) - parser.add_argument('--n_head', type=int, default=32) - parser.add_argument('--n_kv_head', type=int, default=None) - parser.add_argument('--multiple_of', type=int, default=256) - parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) - parser.add_argument('--inter_size', type=int, default=None) - parser.add_argument('--hidden_act', type=str, default='silu') - parser.add_argument('--rms_norm_eps', type=float, default=1e-06) - parser.add_argument('--max_batch_size', type=int, default=8) - parser.add_argument('--max_input_len', type=int, default=2048) - parser.add_argument('--max_output_len', type=int, default=512) - parser.add_argument('--max_beam_width', type=int, default=1) - parser.add_argument('--rotary_base', type=float, default=10000.0) - parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) - parser.add_argument('--use_gpt_attention_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gemm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_rmsnorm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_lookup_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gather_last_token_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_activation_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_elementwise_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument("--use_cast_plugin", action="store_true") - - parser.add_argument('--parallel_build', default=False, action='store_true') - parser.add_argument('--enable_context_fmha', - default=False, - action='store_true') - parser.add_argument('--enable_context_fmha_fp32_acc', - default=False, - action='store_true') - parser.add_argument( - '--use_paged_context_fmha', - action='store_true', - help= - 'Activates paged context FMHA. This mode of the context FMHA is required for chunked context, speculative decoding and reuse of KV cache blocks. Context FMHA performance is worse when this mode is on.' - ) - parser.add_argument( - '--multi_block_mode', - default=False, - action='store_true', - help= - 'Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ - It is beneficial when batch x num_heads cannot fully utilize GPU.' - ) - parser.add_argument( - '--disable_xqa', - default=False, - action='store_true', - help= - 'Disable XQA optimization for the generation MHA. See more details in docs/gpt_attention.' 
- ) - parser.add_argument('--visualize', default=False, action='store_true') - parser.add_argument('--load_by_shard', - action='store_true', - help='Load a pretrained model shard-by-shard.') - parser.add_argument('--enable_debug_output', - default=False, - action='store_true') - parser.add_argument('--gpus_per_node', type=int, default=8) - parser.add_argument('--builder_opt', type=int, default=None) - parser.add_argument( - '--output_dir', - type=str, - default='engine_outputs', - help= - 'The path to save the serialized engine files, timing cache file and model configs' - ) - parser.add_argument('--remove_input_padding', - default=False, - action='store_true') - parser.add_argument( - '--use_fused_mlp', - default=False, - action='store_true', - help= - 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' - 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded ' - '(0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).' - ) - parser.add_argument('--enable_pos_shift', - default=False, - action='store_true', - help='Enable position shift for streamingllm method') - parser.add_argument( - '--dense_context_fmha', - default=False, - action='store_true', - help= - 'Enable dense fmha in context phase, otherwise sliding window attention.' - 'If dense_context_fmha=False, the sliding window size is the max attention window size.' - ) - # Arguments related to the quantization of the model. - parser.add_argument( - '--use_smooth_quant', - default=False, - action="store_true", - help= - 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' - 'See --per_channel and --per_token for finer-grained quantization options.' - ) - parser.add_argument( - '--per_channel', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor for the GEMM\'s result. ' - 'per_channel instead uses a different static scaling factor for each channel. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_token', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale activations in the int8 range. ' - 'per_token chooses at run time, and for each token, a custom scaling factor. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_group', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale weights in the int4 range. ' - 'per_group chooses at run time, and for each group, a custom scaling factor. ' - 'The flag is built for GPTQ/AWQ quantization.') - parser.add_argument('--group_size', - type=int, - default=128, - help='Group size used in GPTQ/AWQ quantization.') - parser.add_argument( - '--int8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--use_parallel_embedding', - action="store_true", - default=False, - help= - 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' - ) - parser.add_argument( - '--embedding_sharding_dim', - type=int, - default=1, # Meta does TP on hidden dim - choices=[0, 1], - help= - 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). 
' - 'To shard it along hidden dimension, set embedding_sharding_dim=1' - 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' - ) - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--quantized_fp8_model_path', - type=str, - default=None, - help='Path of a quantized model checkpoint in .npz format') - parser.add_argument( - '--use_weight_only', - default=False, - action="store_true", - help='Quantize weights for the various GEMMs to INT4/INT8.' - 'See --weight_only_precision to set the precision') - parser.add_argument( - '--disable_weight_only_quant_plugin', - default=False, - action="store_true", - help= - 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--weight_only_precision', - const='int8', - type=str, - nargs='?', - default='int8', - choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], - help= - 'Define the precision for the weights when using weight-only quantization.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--quantize_lm_head', - default=False, - action="store_true", - help='Quantize lm_head weights as well when using int4_awq.') - parser.add_argument( - '--use_inflight_batching', - action="store_true", - default=False, - help="Activates inflight batching mode of gptAttentionPlugin.") - parser.add_argument( - '--paged_kv_cache', - action="store_true", - default=False, - help= - 'By default we use contiguous KV cache. By setting this flag you enable paged KV cache' - ) - parser.add_argument('--tokens_per_block', - type=int, - default=128, - help='Number of tokens per block in paged KV cache') - parser.add_argument( - '--max_num_tokens', - type=int, - default=None, - help= - 'Define the max number of tokens supported by the engine, note that it takes no effect if --remove_input_padding is not set' - ) - parser.add_argument( - '--strongly_typed', - default=False, - action="store_true", - help= - 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' 
- ) - parser.add_argument( - '--use_custom_all_reduce', - action='store_true', - help= - 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') - parser.add_argument( - '--max_prompt_embedding_table_size', - type=int, - default=0, - help='Setting to a value > 0 enables support for prompt tuning.') - parser.add_argument( - '--gather_all_token_logits', - action='store_true', - default=False, - help='Enable both gather_context_logits and gather_generation_logits') - parser.add_argument('--gather_context_logits', - action='store_true', - default=False, - help='Gather context logits') - parser.add_argument('--gather_generation_logits', - action='store_true', - default=False, - help='Gather generation logits') - parser.add_argument( - '--use_lora_plugin', - nargs='?', - const=None, - default=False, - choices=['float16', 'float32', 'bfloat16'], - help="Activates the lora plugin which enables embedding sharing.") - parser.add_argument( - '--lora_target_modules', - nargs='+', - default=None, - choices=[ - "attn_qkv", - "attn_q", - "attn_k", - "attn_v", - "attn_dense", - "mlp_h_to_4h", - "mlp_gate", - "mlp_4h_to_h", - ], - help= - "Add lora in which modules. Only be activated when use_lora_plugin is enabled." - ) - parser.add_argument('--hf_lora_dir', type=str, default=None) - parser.add_argument( - '--max_lora_rank', - type=int, - default=64, - help='maximum lora rank for different lora modules. ' - 'It is used to compute the workspace size of lora plugin.') - parser.add_argument( - '--moe_num_experts', - default=0, - type=int, - help='Specify the number of experts to use for MOE layers') - parser.add_argument( - '--moe_top_k', - default=0, - type=int, - help= - 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' - ) - parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, - type=int, - help= - 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', - ) - parser.add_argument( - '--moe_renorm_mode', - default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, - type=int, - help= - 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', - ) - parser.add_argument("--total_build_time_target", type=float, default=0) - - args = parser.parse_args(cmd_args) - logger.set_level(args.log_level) - - assert args.total_build_time_target >= 0, "total_build_time_target must bigger than 0" - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." - - if not args.remove_input_padding: - if args.use_gpt_attention_plugin: - logger.warning( - f"It is recommended to specify --remove_input_padding when using GPT attention plugin" - ) - - if args.use_inflight_batching: - if not args.use_gpt_attention_plugin: - args.use_gpt_attention_plugin = 'float16' - logger.info( - f"Using GPT attention plugin for inflight batching mode. 
Setting to default '{args.use_gpt_attention_plugin}'"
-            )
-        if not args.remove_input_padding:
-            args.remove_input_padding = True
-            logger.info(
-                "Using remove input padding for inflight batching mode.")
-        if not args.paged_kv_cache:
-            args.paged_kv_cache = True
-            logger.info("Using paged KV cache for inflight batching mode.")
-
-    if args.use_smooth_quant:
-        args.quant_mode = QuantMode.use_smooth_quant(args.per_token,
-                                                     args.per_channel)
-    elif args.use_weight_only:
-        args.quant_mode = QuantMode.from_description(
-            quantize_weights=True,
-            quantize_activations=False,
-            per_token=False,
-            per_channel=False,
-            per_group=args.per_group,
-            use_int4_weights="int4" in args.weight_only_precision)
-    else:
-        args.quant_mode = QuantMode(0)
-
-    if args.int8_kv_cache:
-        args.quant_mode = args.quant_mode.set_int8_kv_cache()
-    elif args.fp8_kv_cache:
-        args.quant_mode = args.quant_mode.set_fp8_kv_cache()
-    if args.enable_fp8:
-        args.quant_mode = args.quant_mode.set_fp8_qdq()
-
-    if args.rotary_scaling is not None:
-        assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin."
-        rotary_scaling = {
-            "type": args.rotary_scaling[0],
-            "factor": float(args.rotary_scaling[1])
-        }
-        assert rotary_scaling["type"] in ["linear", "dynamic"]
-        assert rotary_scaling["factor"] > 1.0
-        args.rotary_scaling = rotary_scaling
-
-    if args.model_dir is not None:
-        hf_config = LlamaConfig.from_pretrained(args.model_dir)
-        if hf_config.model_type == "llava":
-            # LLaVA = Vision model + Llama LLM
-            # We load a llava config and use its text config as the llama config
-            hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config
-            hf_config.model_type = "llava"  # Replace llama with llava
-
-        args.inter_size = hf_config.intermediate_size  # override the inter_size for LLaMA
-        args.n_embd = hf_config.hidden_size
-        args.n_head = hf_config.num_attention_heads
-        if hasattr(hf_config, "num_key_value_heads"):
-            args.n_kv_head = hf_config.num_key_value_heads
-
-        # hf_config.num_hidden_layers = 1  # only for debug
-        args.n_layer = hf_config.num_hidden_layers
-        args.n_positions = hf_config.max_position_embeddings
-        args.vocab_size = hf_config.vocab_size if hf_config.vocab_size is not None else args.vocab_size
-        args.hidden_act = hf_config.hidden_act
-        args.rms_norm_eps = hf_config.rms_norm_eps
-        # These attributes only exist with Mixtral, for the moment
-        args.moe_num_experts = getattr(hf_config, "num_local_experts",
-                                       args.moe_num_experts)
-        args.moe_top_k = getattr(hf_config, "num_experts_per_tok",
-                                 args.moe_top_k)
-        args.rotary_base = getattr(hf_config, "rope_theta", args.rotary_base)
-        args.model_type = hf_config.model_type
-        if hf_config.model_type == "mixtral":
-            # HF LLaMA-type models are implicitly using gated activation.
- # With our MoE implementation, we must make it explicit - args.hidden_act = "swiglu" - - elif args.meta_ckpt_dir is not None: - with open(Path(args.meta_ckpt_dir, "params.json")) as fp: - meta_config: dict = json.load(fp) - args.n_embd = meta_config["dim"] - args.n_head = meta_config["n_heads"] - args.n_layer = meta_config["n_layers"] - args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) - if "hidden_dim" in meta_config: - args.inter_size = meta_config["hidden_dim"] - else: - args.multiple_of = meta_config.get("multiple_of", 1) - n_embd = int(4 * args.n_embd * 2 / 3) - args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) - // args.multiple_of) - args.rms_norm_eps = meta_config["norm_eps"] - args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) - args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", - 0) - elif args.bin_model_dir is not None: - n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( - Path(args.bin_model_dir) / "config.ini") - args.inter_size = inter_size # override the inter_size for LLaMA - args.n_kv_head = n_kv_head - args.n_embd = n_embd - args.n_head = n_head - args.n_layer = n_layer - args.n_positions = n_positions - args.vocab_size = vocab_size if args.vocab_size is None else args.vocab_size - args.hidden_act = hidden_act - args.rms_norm_eps = 1e-06 - logger.warning("Set rms_norm_eps to 1e-06 directly.") - if args.n_kv_head is None: - args.n_kv_head = args.n_head - elif args.n_kv_head != args.n_head: - assert (args.n_head % args.n_kv_head) == 0, \ - "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." - assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ - "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ - "the tensor parallelism size to be divisible by the number of K/V heads." 
- - hf_modules_to_trtllm_modules = { - "q_proj": "attn_q", - "k_proj": "attn_k", - "v_proj": "attn_v", - "o_proj": "attn_dense", - "gate_proj": "mlp_h_to_4h", - "down_proj": "mlp_4h_to_h", - "up_proj": "mlp_gate" - } # lora modules on llama - - trtllm_modules_to_hf_modules = { - "attn_q": "q_proj", - "attn_k": "k_proj", - "attn_v": "v_proj", - "attn_dense": "o_proj", - "mlp_h_to_4h": "gate_proj", - "mlp_4h_to_h": "down_proj", - "mlp_gate": "up_proj", - } - - lora_config = LoraConfig.from_hf(args.hf_lora_dir, - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules) - - if lora_config.is_valid: - if args.lora_target_modules is None: - args.lora_target_modules = lora_config.lora_target_modules - # the lora checkpoint might finetune the embedding - if lora_config.vocab_size != 0: - args.vocab_size = lora_config.vocab_size - - args.lora_config = lora_config - - if args.weight_only_precision == 'int4_awq': - inter_alignment = args.tp_size * 128 - if args.inter_size % inter_alignment != 0: - args.inter_size = int((args.inter_size + inter_alignment - 1) / - inter_alignment) * inter_alignment - logger.info("To use awq we pad intermediate_size to {}.".format( - args.inter_size)) - - if args.quantize_lm_head: - vocab_alignment = args.tp_size * 64 - if args.vocab_size % vocab_alignment != 0: - args.vocab_size = int((args.vocab_size + vocab_alignment - 1) / - vocab_alignment) * vocab_alignment - logger.info("To use awq we pad vocab_size to {}.".format( - args.vocab_size)) - - assert args.pp_size * args.tp_size == args.world_size - - args.max_num_tokens = check_max_num_tokens( - max_num_tokens=args.max_num_tokens, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - remove_input_padding=args.remove_input_padding) - - assert (math.log2(args.tokens_per_block).is_integer() - ), "tokens_per_block must be power of 2" - if args.enable_context_fmha or args.enable_context_fmha_fp32_acc: - assert (args.tokens_per_block >= - 128), "Context fMHA requires >= 128 tokens per block" - - if args.inter_size is None: - # this should not be need when loading a real model - # but it is helpful when creating a dummy model without loading any real weights - n_embd = int(4 * args.n_embd * 2 / 3) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // - args.multiple_of) - logger.info(f"Setting inter_size to {args.inter_size}.") - - if args.enable_pos_shift: - assert args.use_gpt_attention_plugin, "Position shift is only support in the gpt attention plugin." 
- assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc - - if args.moe_num_experts and args.moe_top_k == 0: - args.moe_top_k = 1 - args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, - args.moe_tp_mode, - args.moe_renorm_mode).validate() - - if args.gather_all_token_logits: - args.gather_context_logits = True - args.gather_generation_logits = True - - return args - - -def get_model_object(args, mapping, trt_dtype=None): - if trt_dtype is None: - trt_dtype = str_dtype_to_trt(args.dtype) - # Initialize Module - logger.debug("[Python]llama exampels, Initialize tensorrt_llm.models.LLaMAForCausalLM....") - tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM( - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - dtype=trt_dtype, - mlp_hidden_size=args.inter_size, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - mapping=mapping, - rotary_base=args.rotary_base, - rotary_scaling=args.rotary_scaling, - use_parallel_embedding=args.use_parallel_embedding, - embedding_sharding_dim=args.embedding_sharding_dim, - quant_mode=args.quant_mode, - rms_norm_eps=args.rms_norm_eps, - use_fused_mlp=args.use_fused_mlp, - use_prompt_tuning=args.max_prompt_embedding_table_size > 0, - enable_pos_shift=args.enable_pos_shift, - dense_context_fmha=args.dense_context_fmha, - moe_config=args.moe_config, - max_lora_rank=args.max_lora_rank) - quantize_kwargs = {} - if args.use_smooth_quant or args.use_weight_only: - if args.weight_only_precision == 'int4_awq': - exclude_modules = ['lm_head'] if not args.quantize_lm_head else [] - quantize_kwargs = { - "group_size": args.group_size, - "zero": False, - "pre_quant_scale": True, - "exclude_modules": exclude_modules, - } - elif args.weight_only_precision == 'int4_gptq': - quantize_kwargs = { - "group_size": args.group_size, - "zero": True, - "pre_quant_scale": False, - } - elif args.enable_fp8 or args.fp8_kv_cache: - logger.info(f'Loading scaling factors from ' - f'{args.quantized_fp8_model_path}') - quant_scales = get_scaling_factors(args.quantized_fp8_model_path, - num_layers=args.n_layer, - quant_mode=args.quant_mode) - quantize_kwargs = {"quant_scales": quant_scales} - - if args.use_weight_only and args.moe_config.has_moe(): - if 'exclude_modules' in quantize_kwargs: - quantize_kwargs['exclude_modules'].append('router') - else: - quantize_kwargs['exclude_modules'] = ['lm_head', 'router'] - - tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, - **quantize_kwargs) - if args.per_group: - if args.weight_only_precision == 'int4_awq': - load_from_awq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - quantize_lm_head=args.quantize_lm_head, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - else: - load_from_gptq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - elif args.meta_ckpt_dir is not None: - load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, - args.dtype) - elif args.model_dir is not None: - logger.info(f'Loading HF LLaMA ... 
from {args.model_dir}') - tik = time.time() - if not args.load_by_shard: - if args.model_type == "llava": - hf_llava = LlavaForConditionalGeneration.from_pretrained( - args.model_dir, torch_dtype="auto") - hf_llama = hf_llava.language_model - else: - hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM - hf_llama = hf_model.from_pretrained( - args.model_dir, - device_map={ - "model": "cpu", - "lm_head": "cpu", - "embed_tokens": "cpu", - "layers": "cpu", - "norm": "cpu", - }, # Load to CPU memory - torch_dtype='auto', - ) - use_gemm_woq_plugin = not args.disable_weight_only_quant_plugin - # hf_llama.config.num_hidden_layers = 1 # only for debug - load_from_hf_llama(tensorrt_llm_llama, - hf_llama, - mapping=mapping, - dtype=args.dtype, - use_gemm_woq_plugin=use_gemm_woq_plugin, - lora_config=args.lora_config) - del hf_llama - else: - load_from_hf_checkpoint(tensorrt_llm_llama, - args.model_dir, - mapping, - dtype=args.dtype, - lora_config=args.lora_config) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'HF LLaMA loaded. Total time: {t}') - - elif args.bin_model_dir is not None: - load_from_binary(tensorrt_llm_llama, - args.bin_model_dir, - mapping, - fp16=(args.dtype == 'float16'), - multi_query_mode=(args.n_kv_head != args.n_head)) - - return tensorrt_llm_llama - - -def update_plugin_configs(args, network): - if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin( - dtype=args.use_gpt_attention_plugin) - if args.use_gemm_plugin: - if not args.enable_fp8: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - else: - logger.info( - "Gemm plugin does not support FP8. Disabled Gemm plugin.") - if args.use_rmsnorm_plugin: - network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) - if args.use_lora_plugin: - network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) - if args.use_lookup_plugin: - network.plugin_config.set_lookup_plugin(dtype=args.use_lookup_plugin) - if args.use_gather_last_token_plugin: - network.plugin_config.set_gather_last_token_plugin(dtype=args.use_gather_last_token_plugin) - if args.use_activation_plugin: - network.plugin_config.set_activation_plugin(dtype=args.use_activation_plugin) - if args.use_elementwise_plugin: - network.plugin_config.set_elementwise_plugin(dtype=args.use_elementwise_plugin) - if args.use_cast_plugin: - network.plugin_config.set_cast_plugin() - - # Quantization plugins. 
- if args.use_smooth_quant: - network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) - network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) - network.plugin_config.set_quantize_tensor_plugin() - network.plugin_config.set_quantize_per_token_plugin() - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha( - ContextFMHAType.enabled_with_fp32_acc) - if args.multi_block_mode: - network.plugin_config.enable_mmha_multi_block_mode() - if not args.disable_xqa: - network.plugin_config.enable_xqa_optimization() - - if args.use_weight_only and not args.disable_weight_only_quant_plugin: - if args.per_group: - network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( - dtype=args.dtype) - else: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.dtype) - if args.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype, - args.use_custom_all_reduce) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) - return - - -def build_rank_engine(builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, rank, args): - ''' - @brief: Build the engine on the given rank. - @param rank: The rank to build the engine. - @param args: The cmd line arguments. - @return: The built engine. - ''' - dtype = str_dtype_to_trt(args.dtype) - mapping = Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.tp_size, - pp_size=args.pp_size) - - assert args.n_layer % args.pp_size == 0, \ - f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} Engine build starts') - # Initialize Module - tensorrt_llm_llama = get_model_object(args, - mapping=mapping, - trt_dtype=dtype) - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} model weight loaded.') - - # Module -> Network - logger.debug("[Python]llama exampels, convert module to network....") - network = builder.create_network() - network.trt_network.name = engine_name - update_plugin_configs(args, network) - - if args.use_paged_context_fmha: - assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc, "context fmha must be enabled" - network.plugin_config.set_paged_context_fmha() - - logger.debug(f"[Python]llama exampels, network.plugin_config: \n{network.plugin_config}") - with net_guard(network): - # Prepare - network.set_named_parameters(tensorrt_llm_llama.named_parameters()) - - # Forward - inputs = tensorrt_llm_llama.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_seq_len=args.max_input_len + args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - max_num_tokens=args.max_num_tokens, - prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules) - logger.info(f"[Python]llama exampels, forward....\n") - tensorrt_llm_llama(*inputs) - logger.info(f"[Python]llama exampels, forward finished\n") - if args.enable_debug_output: - # mark intermediate nodes' 
outputs - for k, v in tensorrt_llm_llama.named_network_outputs(): - logger.debug(f"enable_debug_output, debug tensor name: {k}") - v = v.trt_tensor - v.name = k - network.trt_network.mark_output(v) - v.dtype = dtype - if args.visualize: - model_path = os.path.join(args.output_dir, 'test.onnx') - to_onnx(network.trt_network, model_path) - - logger.debug("[Python]llama examples, tensorrt_llm.graph_rewriting.optimize....") - tensorrt_llm.graph_rewriting.optimize(network) - - engine = None - - # Network -> Engine - logger.debug("[Python]llama examples, builder.build_engine....") - engine = builder.build_engine(network, builder_config) - if rank == 0: - config_path = os.path.join(args.output_dir, 'config.json') - builder.save_config(builder_config, config_path) - - return engine - - -def get_builder_config_namespace(args, cache): - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). - int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - config = argparse.Namespace( - name=MODEL_NAME, - precision=args.dtype, - timing_cache=args.timing_cache if cache is None else cache, - profiling_verbosity=args.profiling_verbosity, - tensor_parallel=args.tp_size, - pipeline_parallel=args.pp_size, - parallel_build=args.parallel_build, - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - max_batch_size=args.max_batch_size, - max_beam_width=args.max_beam_width, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_num_tokens=args.max_num_tokens, - int8=int8_trt_flag, - quant_mode=args.quant_mode, - strongly_typed=args.strongly_typed, - opt_level=args.builder_opt, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules, - mlp_hidden_size=args.inter_size, - hf_modules_to_trtllm_modules=args.lora_config. - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules=args.lora_config. - trtllm_modules_to_hf_modules, - ) - return config - - -def build(rank, args): - torch.cuda.set_device(rank % args.gpus_per_node) - logger.set_level(args.log_level) - os.makedirs(args.output_dir, exist_ok=True) - - # when doing serializing build, all ranks share one engine - builder = Builder() - cache = None - for cur_rank in range(args.world_size): - # skip other ranks if parallel_build is enabled - if args.parallel_build and cur_rank != rank: - continue - tik = time.time() - - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). 
- int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - builder_config = builder.create_builder_config( - **vars(get_builder_config_namespace(args, cache))) - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, - args.pp_size, cur_rank) - logger.debug("[Python]llama example, build_rank_engine....") - engine = build_rank_engine(builder, builder_config, engine_name, - cur_rank, args) - assert engine is not None, f'Failed to build engine for rank {cur_rank}' - - local_num_kv_heads = (args.n_kv_head + args.world_size - - 1) // args.world_size - kv_dtype = str_dtype_to_trt(args.dtype) - if args.quant_mode.has_int8_kv_cache(): - kv_dtype = str_dtype_to_trt('int8') - elif args.quant_mode.has_fp8_kv_cache(): - kv_dtype = str_dtype_to_trt('fp8') - - # FIXME (Not Support libnvidia-ml.so) - # profiler.check_gpt_mem_usage( - # engine=engine, - # kv_dtype=kv_dtype, - # use_gpt_attention_plugin=args.use_gpt_attention_plugin, - # paged_kv_cache=args.paged_kv_cache, - # max_batch_size=args.max_batch_size, - # max_beam_width=args.max_beam_width, - # max_seq_len=args.max_input_len + args.max_output_len, - # local_num_kv_heads=local_num_kv_heads, - # head_size=args.n_embd / args.n_head, - # num_layers=args.n_layer) - - if cur_rank == 0: - # Use in-memory timing cache for multiple builder passes. - if not args.parallel_build: - cache = builder_config.trt_builder_config.get_timing_cache() - - serialize_engine(engine, os.path.join(args.output_dir, engine_name)) - del engine - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {cur_rank} Engine serialized') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info( - f'Rank {cur_rank} Engine build time: {t} - {tok - tik} (sec)') - - if rank == 0: - ok = builder.save_timing_cache( - builder_config, os.path.join(args.output_dir, "model.cache")) - assert ok, "Failed to save timing cache." - - -if __name__ == '__main__': - args = parse_arguments() - print(args) - tik = time.time() - if args.parallel_build and args.world_size > 1 and \ - torch.cuda.device_count() >= args.world_size: - logger.warning( - f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' - ) - mp.spawn(build, nprocs=args.world_size, args=(args, )) - else: - args.parallel_build = False - logger.info('Serially build TensorRT engines.') - build(0, args) - - tok = time.time() - build_engine_time = tok - tik - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of building all {args.world_size} engines: {t}') - - if args.total_build_time_target != 0: - status = build_engine_time <= args.total_build_time_target - if status: - print("successful.") - else: - print(f"Build engine time check failed! 
Target: {args.total_build_time_target}, Actual: {build_engine_time}") - sys.exit(int(not status)) diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/convert_checkpoint.py b/models/nlp/large_language_model/llama2-13b/trtllm/convert_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..6c44e840456923d9e77b43c1a509e77658d175ae --- /dev/null +++ b/models/nlp/large_language_model/llama2-13b/trtllm/convert_checkpoint.py @@ -0,0 +1,500 @@ +import argparse +import json +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed + +from transformers import AutoConfig + +import tensorrt_llm +from tensorrt_llm._utils import release_gc +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import LLaMAForCausalLM +from tensorrt_llm.models.convert_utils import has_safetensors +from tensorrt_llm.models.llama.convert import load_hf_llama +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' + ) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--inter_size', type=int, default=11008) + parser.add_argument('--multiple_of', type=int, default=None) + parser.add_argument('--ffn_dim_multiplier', type=float, default=None) + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+    )
+    parser.add_argument(
+        '--calib_dataset',
+        type=str,
+        default='ccdv/cnn_dailymail',
+        help=
+        "The huggingface dataset name or the local directory of the dataset for calibration."
+    )
+    parser.add_argument(
+        "--smoothquant",
+        "-sq",
+        type=float,
+        default=None,
+        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
+        " to Smoothquant the model, and output int8 weights."
+        " A good first try is 0.5. Must be in [0, 1]")
+    parser.add_argument(
+        '--per_channel',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor for the GEMM\'s result. '
+        'per_channel instead uses a different static scaling factor for each channel. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--per_token',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor to scale activations in the int8 range. '
+        'per_token chooses at run time, and for each token, a custom scaling factor. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--int8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
+    )
+    parser.add_argument(
+        '--fp8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV'
+    )
+    parser.add_argument(
+        '--quant_ckpt_path',
+        type=str,
+        default=None,
+        help='Path of a quantized model checkpoint in .safetensors format')
+    parser.add_argument("--use_fp8_rowwise",
+                        action="store_true",
+                        default=False,
+                        help="Enable Fp8 per-token per-channel quantization")
+
+    parser.add_argument(
+        '--per_group',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use a single static scaling factor to scale weights in the int4 range. '
+        'per_group chooses at run time, and for each group, a custom scaling factor. '
+        'The flag is built for GPTQ/AWQ quantization.')
+
+    parser.add_argument('--load_by_shard',
+                        action='store_true',
+                        help='Load a pretrained model shard-by-shard.')
+    parser.add_argument('--hidden_act', type=str, default='silu')
+
+    parser.add_argument('--rotary_base', type=float, default=10000.0)
+
+    parser.add_argument('--group_size',
+                        type=int,
+                        default=128,
+                        help='Group size used in GPTQ quantization.'
+                        )  # AWQ is only supported by quantize.py script
+
+    parser.add_argument("--load_model_on_cpu", action="store_true")
+    parser.add_argument(
+        '--use_parallel_embedding',
+        action="store_true",
+        default=False,
+        help=
+        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
+    )
+    parser.add_argument(
+        '--embedding_sharding_dim',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=
+        'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
+        'To shard it along hidden dimension, set embedding_sharding_dim=1. '
+        'Note: embedding sharing is only enabled when embedding_sharding_dim = 0'
+    )
+    parser.add_argument(
+        '--use_embedding_sharing',
+        action="store_true",
+        default=False,
+        help=
+        'Try to reduce the engine size by sharing the embedding lookup table between two layers.'
+        'Note: the flag might not take effect when the criteria are not met.')
+    parser.add_argument('--output_dir',
+                        type=str,
+                        default='tllm_checkpoint',
+                        help='The path to save the TensorRT-LLM checkpoint')
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=1,
+        help='The number of workers for converting checkpoint in parallel')
+    parser.add_argument(
+        '--moe_num_experts',
+        default=0,
+        type=int,
+        help='Specify the number of experts to use for MOE layers')
+    parser.add_argument(
+        '--moe_top_k',
+        default=0,
+        type=int,
+        help=
+        'Specify the top_k value to use for MOE layers. Defaults to 1 if --moe_num_experts is set'
+    )
+    parser.add_argument(
+        '--moe_renorm_mode',
+        default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE,
+        type=int,
+        help=
+        'Controls renormalization after gate logits. Check layers/moe.py for accepted values',
+    )
+    parser.add_argument(
+        '--save_config_only',
+        action="store_true",
+        default=False,
+        help=
+        'Only save the model config without reading and converting the weights; be careful, this is for debug only'
+    )
+    parser.add_argument(
+        '--remove_duplicated_kv_heads',
+        action="store_true",
+        default=False,
+        help=
+        'Only used to remove the duplicated kv heads of llama-3.1 405B HF model.'
+    )
+    parser.add_argument('--log_level', type=str, default='info')
+
+    args = parser.parse_args()
+    # change the default to be consistent with what the CLI help says.
+    if args.moe_num_experts and args.moe_top_k == 0:
+        args.moe_top_k = 1
+    return args
+
+
+def args_to_quant_config(args: argparse.Namespace) -> QuantConfig:
+    '''Return a QuantConfig with quantization info based on the command line args
+    '''
+    quant_config = QuantConfig()
+    if args.use_weight_only:
+        if args.weight_only_precision == 'int8':
+            quant_config.quant_algo = QuantAlgo.W8A16
+        elif args.weight_only_precision == 'int4':
+            quant_config.quant_algo = QuantAlgo.W4A16
+    elif args.smoothquant:
+        quant_config.smoothquant_val = args.smoothquant
+        if args.per_channel:
+            if args.per_token:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
+            else:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN
+        else:
+            if args.per_token:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN
+            else:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN
+    elif args.use_fp8_rowwise:
+        quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
+        # this will be overwritten if specified in the hf config.
+        quant_config.clamp_val = [-1200.0, 1200.0]
+
+    if args.int8_kv_cache:
+        quant_config.kv_cache_quant_algo = QuantAlgo.INT8
+
+    if args.fp8_kv_cache:
+        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+    if args.weight_only_precision == 'int4_gptq':
+        quant_config.group_size = args.group_size
+        quant_config.has_zero_point = True
+        quant_config.pre_quant_scale = False
+        quant_config.quant_algo = QuantAlgo.W4A16_GPTQ
+
+    return quant_config
+
+
+def update_quant_config_from_hf(quant_config, hf_config) -> QuantConfig:
+    hf_config_dict = hf_config.to_dict()
+    if hf_config_dict.get('quantization_config'):
+        # update the quant_algo, and clamp_val.
+        if hf_config_dict['quantization_config'].get(
+                'quant_method') == 'fbgemm_fp8':
+            logger.info(
+                "Load quantization configs from huggingface model_config.")
+            quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
+            activation_scale_ub = hf_config_dict['quantization_config'].get(
+                'activation_scale_ub', 1200.0)
+            quant_config.clamp_val = [-activation_scale_ub, activation_scale_ub]
+    return quant_config
+
+
+def convert_and_save_meta(args, rank):
+    mapping = Mapping(world_size=args.tp_size * args.pp_size,
+                      tp_size=args.tp_size,
+                      pp_size=args.pp_size,
+                      moe_tp_size=args.moe_tp_size,
+                      moe_ep_size=args.moe_ep_size,
+                      rank=rank)
+    llama = LLaMAForCausalLM.from_meta_ckpt(
+        args.meta_ckpt_dir,
+        args.dtype,
+        quant_config=args_to_quant_config(args),
+        mapping=mapping,
+        use_parallel_embedding=args.use_parallel_embedding,
+        embedding_sharding_dim=args.embedding_sharding_dim)
+    llama.save_checkpoint(args.output_dir, save_config=(rank == 0))
+
+
+def args_to_build_options(args):
+    return {
+        'use_parallel_embedding': args.use_parallel_embedding,
+        'embedding_sharding_dim': args.embedding_sharding_dim,
+        'share_embedding_table': args.use_embedding_sharing,
+        'disable_weight_only_quant_plugin':
+        args.disable_weight_only_quant_plugin,
+        'remove_duplicated_kv_heads': args.remove_duplicated_kv_heads,
+        'quant_ckpt_path': args.quant_ckpt_path,
+        'load_model_on_cpu': args.load_model_on_cpu,
+    }
+
+
+def from_cli_args(args):
+    n_kv_head = args.n_kv_head if args.n_kv_head is not None else args.n_head
+    config = {
+        'architecture': "LlamaForCausalLM",
+        'dtype': args.dtype,
+        'logits_dtype': 'float32',
+        'num_hidden_layers': args.n_layer,
+        'num_attention_heads': args.n_head,
+        'hidden_size': args.n_embd,
+        'intermediate_size': args.inter_size,
+        'ffn_dim_multiplier': args.ffn_dim_multiplier,
+        'multiple_of': args.multiple_of,
+        'num_key_value_heads': n_kv_head,
+        'vocab_size': args.vocab_size,
+        'position_embedding_type': 'rope_gpt_neox',
+        'max_position_embeddings': args.n_positions,
+        'hidden_act': args.hidden_act,
+        'rotary_base': args.rotary_base,
+        'norm_epsilon': args.rms_norm_eps,
+        'moe': {
+            'num_experts': args.moe_num_experts,
+            'top_k': args.moe_top_k,
+            'normalization_mode': args.moe_renorm_mode,
+        },
+        'mapping': {
+            'world_size': args.tp_size * args.pp_size,
+            'tp_size': args.tp_size,
+            'pp_size': args.pp_size,
+            'moe_tp_size': args.moe_tp_size,
+            'moe_ep_size': args.moe_ep_size,
+        },
+        'quantization': args_to_quant_config(args).to_dict()
+    }
+    config.update(args_to_build_options(args))
+    return config
+
+
+def convert_and_save_hf(args):
+    model_dir = args.model_dir
+    load_model_on_cpu = args.load_model_on_cpu
+    load_by_shard = args.load_by_shard
+    world_size = args.tp_size * args.pp_size
+    # Need to convert the cli args to the key-value pairs and override them in the generated config dict.
+    # Ideally these fields will be moved out of the config and passed into the build API; keep them here for compatibility for now,
+    # before the refactor is done.
+    override_fields = {}
+    override_fields.update(args_to_build_options(args))
+
+    quant_config = args_to_quant_config(args)
+
+    try:
+        hf_config = AutoConfig.from_pretrained(model_dir,
+                                               trust_remote_code=True)
+        quant_config = update_quant_config_from_hf(quant_config, hf_config)
+    except:
+        # llava_llama needs its own defined config.
+ logger.warning("AutoConfig cannot load the huggingface config.") + + if args.smoothquant is not None or args.int8_kv_cache: + assert not args.load_by_shard, "When using quantization, TRT-LLM needs to load the whole HF model, thus load by shard not supported" + mapping = Mapping(world_size=world_size, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + # TODO: support moe quantization for tp + ep + LLaMAForCausalLM.quantize( + args.model_dir, + args.output_dir, + dtype=args.dtype, + mapping=mapping, + quant_config=quant_config, + device='cpu' if args.load_model_on_cpu else 'cuda', + calib_dataset=args.calib_dataset, + **override_fields) + else: + # When not loading by shard, preload one complete model and then slice per rank weights from this + # this saves the disk reloading time + hf_model = None + if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER") is not None \ + and os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER").strip() == "2": + if "vila" in model_dir or "llava" in model_dir: + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + elif not (args.load_by_shard or + (has_safetensors(model_dir) + and not quant_config.quant_mode.has_any_quant())): + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + llama = LLaMAForCausalLM.from_hugging_face( + model_dir if hf_model is None else hf_model, + args.dtype, + mapping=mapping, + quant_config=quant_config, + load_by_shard=load_by_shard, + **override_fields, + ) + llama.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del llama + + execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
+ + +def main(): + print(tensorrt_llm.__version__) + args = parse_arguments() + logger.set_level(args.log_level) + + world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + if (args.model_dir is None + and args.meta_ckpt_dir is None): # generate fake config.json + config = from_cli_args(args) + with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + elif args.meta_ckpt_dir is not None: + assert args.model_dir is None, "Shall not specify both meta checkpoint dir and hugging face dir" + execute(args.workers, [convert_and_save_meta] * world_size, args) + else: # all other paths from hf model + assert args.model_dir is not None + assert ( + args.quant_ckpt_path is not None + and args.weight_only_precision == 'int4_gptq' + ) or args.quant_ckpt_path is None, "only gptq weights only needs this option" + convert_and_save_hf(args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/run.py b/models/nlp/large_language_model/llama2-13b/trtllm/run.py index 3899ec9d55a33bca6eeeac4840353345467b474d..5590749592d3237b3087f2b745fd9abb9569bf51 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/run.py +++ b/models/nlp/large_language_model/llama2-13b/trtllm/run.py @@ -16,63 +16,45 @@ import argparse import ast import csv +import os from pathlib import Path import sys import time +import sys +import time import numpy as np import torch +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + add_common_args, load_tokenizer, read_decoder_start_token_id, + read_model_name, supports_inflight_batching, + throttle_generator) + import tensorrt_llm import tensorrt_llm.profiler from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - load_tokenizer, read_model_name, throttle_generator) - if PYTHON_BINDINGS: from tensorrt_llm.runtime import ModelRunnerCpp def parse_arguments(args=None): + # see `add_common_args` for extended list of arguments parser = argparse.ArgumentParser() + parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--max_attention_window_size', - type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - parser.add_argument('--log_level', type=str, default='error') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") parser.add_argument( '--input_text', type=str, 
nargs='+', default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--no_prompt_template', - dest='use_prompt_template', - default=True, - action='store_false', - help= - "Whether or not to use default prompt template to wrap the input text.") parser.add_argument( '--input_file', type=str, help= 'CSV or Numpy file containing tokenized input. Alternative to text input.', default=None) - parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--output_csv', type=str, help='CSV file where the tokenized output is stored.', @@ -87,89 +69,26 @@ def parse_arguments(args=None): help= 'Numpy file where the generation logits are stored. Use only when num_beams==1', default=None) - parser.add_argument('--tokenizer_dir', - help="HF tokenizer config path", - default='gpt2') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file', - help="Used for sentencepiece tokenizers") - parser.add_argument('--num_beams', - type=int, - help="Use beam search if num_beams >1", - default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument('--streaming', default=False, action='store_true') - parser.add_argument('--streaming_interval', - type=int, - help="How often to return tokens when streaming.", - default=5) - parser.add_argument( - '--prompt_table_path', - type=str, - help="Path to .npy file, exported by nemo_prompt_convert.py") - parser.add_argument( - '--prompt_tasks', - help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") - parser.add_argument('--lora_dir', + parser.add_argument('--output_log_probs_npy', type=str, - default=None, - nargs="+", - help="The directory of LoRA weights") - parser.add_argument( - '--lora_task_uids', - type=str, - default=None, - nargs="+", - help="The list of LoRA task uids; use -1 to disable the LoRA module") - parser.add_argument('--lora_ckpt_source', + help='Numpy file where the log_probs are stored', + default=None) + parser.add_argument('--output_cum_log_probs_npy', type=str, - default="hf", - choices=["hf", "nemo"], - help="The source of lora checkpoint.") - parser.add_argument( - '--num_prepend_vtokens', - nargs="+", - type=int, - help="Number of (default) virtual tokens to prepend to each sentence." 
- " For example, '--num_prepend_vtokens=10' will prepend the tokens" - " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + help='Numpy file where the cum_log_probs are stored', + default=None) parser.add_argument( '--run_profiling', default=False, action='store_true', help="Run several 10 iterations to profile the inference latencies.") - parser.add_argument( - '--medusa_choices', - type=str, - default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." - ) parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_qps', type=float, default=0) + parser = add_common_args(parser) return parser.parse_args(args=args) @@ -182,7 +101,8 @@ def parse_input(tokenizer, max_input_length=923, pad_id=None, num_prepend_vtokens=[], - model_name=None): + model_name=None, + model_version=None): if pad_id is None: pad_id = tokenizer.pad_token_id @@ -211,13 +131,12 @@ def parse_input(tokenizer, elif input_file.endswith('.txt'): with open(input_file, 'r', encoding='utf-8', errors='replace') as txt_file: - input_text = txt_file.read() - input_ids = tokenizer.encode( + input_text = txt_file.readlines() + batch_input_ids = tokenizer( input_text, add_special_tokens=add_special_tokens, truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) + max_length=max_input_length)["input_ids"] else: print('Input file format not supported.') raise SystemExit @@ -230,9 +149,11 @@ def parse_input(tokenizer, batch_input_ids[i] = list( range(base_vocab_size, base_vocab_size + length)) + batch_input_ids[i] - if model_name == 'glm_10b': + + if input_file is None and 'GLM' in model_name and model_version == 'glm': for ids in batch_input_ids: ids.append(tokenizer.sop_token_id) + batch_input_ids = [ torch.tensor(x, dtype=torch.int32) for x in batch_input_ids ] @@ -247,7 +168,11 @@ def print_output(tokenizer, output_npy=None, context_logits=None, generation_logits=None, - output_logits_npy=None): + cum_log_probs=None, + log_probs=None, + output_logits_npy=None, + output_cum_log_probs_npy=None, + output_log_probs_npy=None): batch_size, num_beams, _ = output_ids.size() if output_csv is None and output_npy is None: for batch_idx in range(batch_size): @@ -265,7 +190,6 @@ def print_output(tokenizer, f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: output_file = Path(output_csv) output_file.parent.mkdir(exist_ok=True, parents=True) @@ -303,6 +227,20 @@ def print_output(tokenizer, dtype='float32') np.save(output_generation_logits_file, generation_outputs) + # Save cum log probs + if cum_log_probs is not None and output_cum_log_probs_npy is not None: + cum_log_probs_file = Path(output_cum_log_probs_npy) + cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), + dtype='float32') + np.save(cum_log_probs_file, cum_log_probs_outputs) + + # Save cum log probs + if log_probs is not None and output_log_probs_npy is not None: + log_probs_file = Path(output_log_probs_npy) + log_probs_outputs = np.array(log_probs.cpu().contiguous(), + dtype='float32') + np.save(log_probs_file, log_probs_outputs) + def check_status(args, load_engine_time, qps): print("==================== check status ====================") @@ -320,28 +258,35 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = 
read_model_name(args.engine_dir) - if args.tokenizer_dir is None: + # different handling if encoder-decoder models + is_enc_dec = { + name + for name in os.listdir(args.engine_dir) + if os.path.isdir(os.path.join(args.engine_dir, name)) + } == {'encoder', 'decoder'} + if is_enc_dec: + logger.warning( + "This path is an encoder-decoder model. Using different handling.") + assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." + + model_name, model_version = read_model_name( + args.engine_dir) if not is_enc_dec else ("", "") + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: + logger.warning( + "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." + ) args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) - # # An example to stop generation when the model generate " London" on first sentence, " eventually became" on second sentence - # stop_words_list = [[" London"], ["eventually became"]] - # stop_words_list = tensorrt_llm.runtime.to_word_list_format(stop_words_list, tokenizer) - # stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous() - stop_words_list = None - - # # An example to prevent generating " chef" on first sentence, " eventually" and " chef before" on second sentence - # bad_words_list = [[" chef"], [" eventually, chef before"]] - # bad_words_list = tensorrt_llm.runtime.to_word_list_format(bad_words_list, tokenizer) - # bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous() - bad_words_list = None + if args.end_id: + end_id = args.end_id prompt_template = None if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: @@ -354,8 +299,47 @@ def main(args): max_input_length=args.max_input_length, pad_id=pad_id, num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name) - input_lengths = [x.size(0) for x in batch_input_ids] + model_name=model_name, + model_version=model_version) + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * len(batch_input_ids) + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + + if is_enc_dec: + encoder_input_ids = batch_input_ids + decoder_start_token_id = read_decoder_start_token_id( + os.path.join(args.engine_dir, "decoder")) + decoder_input_ids = [ + torch.tensor([decoder_start_token_id], dtype=torch.int32) + for _ in batch_input_ids + ] + + input_lengths = [x.size(0) for x in decoder_input_ids + ] if is_enc_dec else [x.size(0) for x in batch_input_ids] + encoder_input_lengths = [x.size(0) + for x in encoder_input_ids] if is_enc_dec else None + + if not args.use_py_session and not supports_inflight_batching( + os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. 
+ engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True if not PYTHON_BINDINGS and not args.use_py_session: logger.warning( @@ -367,34 +351,60 @@ def main(args): "Debug mode is not supported in C++ session for now, fallback to Python session." ) args.use_py_session = True + if args.return_all_generated_tokens and args.use_py_session: + raise ValueError( + "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." + ) + if (not args.return_all_generated_tokens) and args.streaming and ( + args.num_beams > 1): + logger.warning( + "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " + "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + args.return_all_generated_tokens = True runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source) + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + ) + if not args.use_py_session: + runner_kwargs.update(is_enc_dec=is_enc_dec) if args.medusa_choices is not None: args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" assert args.num_beams == 1, "Medusa should use num_beams == 1" runner_kwargs.update(medusa_choices=args.medusa_choices) if not args.use_py_session: runner_kwargs.update( max_batch_size=len(batch_input_ids), - max_input_len=max(input_lengths), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), max_output_len=args.max_output_len, max_beam_width=args.num_beams, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, - ) + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) runner = runner_cls.from_dir(**runner_kwargs) torch.cuda.synchronize() start_time = time.time() with torch.no_grad(): outputs = runner.generate( - batch_input_ids, + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, max_new_tokens=args.max_output_len, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, @@ -405,27 +415,32 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, return_dict=True, - medusa_choices=args.medusa_choices) + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens) torch.cuda.synchronize() - - status = False + end_time = time.time() + if runtime_rank == 0: num_inputs = sum([torch.numel(x) for x in batch_input_ids]) num_outputs = torch.numel(outputs["output_ids"]) num_gens = num_outputs - num_inputs - load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") qps = num_gens/(end_time-start_time) logger.info(f'Load engine takes: {load_engine_time} sec') @@ -433,29 +448,46 @@ def main(args): status = check_status(args, load_engine_time, qps) else: status = True - + if args.streaming: for curr_outputs in throttle_generator(outputs, args.streaming_interval): if runtime_rank == 0: output_ids = curr_outputs['output_ids'] sequence_lengths = curr_outputs['sequence_lengths'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy) + cum_log_probs = None + log_probs = None + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] + print_output( + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) else: if runtime_rank == 0: output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] context_logits = None generation_logits = None + cum_log_probs = None + log_probs = None if runner.gather_context_logits: context_logits = outputs['context_logits'] if runner.gather_generation_logits: generation_logits = outputs['generation_logits'] + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] print_output(tokenizer, output_ids, input_lengths, @@ -464,7 +496,11 @@ def 
main(args): output_npy=args.output_npy, context_logits=context_logits, generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy) + output_logits_npy=args.output_logits_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) if args.run_profiling: ite = 10 @@ -482,17 +518,24 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.start("tmp") @@ -509,23 +552,31 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.stop("tmp") print( f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" ) + if status: print("successful.") else: @@ -536,4 +587,4 @@ def main(args): if __name__ == '__main__': args = parse_arguments() print(args) - main(args) + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/requirements.txt b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/requirements.txt index f7cbbb8b7e9bbd8aab6303fd8b5de1dacbd353b8..38e019fe009252d42e512c6b71cc261bb7788de7 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/requirements.txt +++ b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/requirements.txt @@ -1,30 +1,43 @@ -accelerate +accelerate>=0.25.0 build colored # cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. -diffusers +# diffusers>=0.27.0 lark mpi4py -numpy +numpy<2 onnx>=1.12.0 +openai polygraphy psutil pybind11 -pynvml>=11.5.0 -sentencepiece>=0.1.99 -# tensorrt==9.2.0.post12.dev5 -# torch -# nvidia-ammo~=0.5.0; platform_machine=="x86_64" -transformers +# pynvml>=11.5.0 +pulp +pandas +h5py==3.10.0 +StrEnum +# tensorrt~=10.3.0 +# https://github.com/pytorch/pytorch/blob/v2.4.0/version.txt uses 2.4.0a0. 
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 uses 2.4.0a0. +# torch>=2.4.0a0,<=2.4.0 +# nvidia-modelopt~=0.15.0 +transformers>=4.38.2,<=4.42.4 +#transformers +pillow==10.3.0 wheel optimum -evaluate janus -parameterized -scikit-learn +mpmath>=1.3.0 +click +click_option_group +aenum +datasets==2.14.6 +evaluate~=0.4.1 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 + -# special -scipy==1.11.4 -pandas==1.5.3 -nltk -rouge_score +setuptools +parameterized +# scikit-learn +# scipy==1.11.4 \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2.sh b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2.sh index 2e0f26dd76cbf12ea7789b21727b20615e34b5ec..72212c0f2087f4df4faa6e67ae8df10cadd1ef7a 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2.sh +++ b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2.sh @@ -15,16 +15,20 @@ #!/bin/bash +echo "start run $0" + EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +LOAD_TIME_TARGET=${LOAD_TIME_TARGET:-16} +TPS_TARGET=${TPS_TARGET:-32.2} PROJECT_DIR="./" DATASET_DIR=${DATASET_DIR:-"${PROJECT_DIR}/data/datasets_cnn_dailymail"} MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-13b-chat"} -ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}/checkpoints/"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -38,7 +42,7 @@ check_status() export TASK_DATA_PATH=${DATASET_DIR} -# target is 95% of best (load engine time: 41.74, rouge1: 29.21, tps: 15.23) +# target is 80% of best (load engine time: 11, rouge1: 29.54, tps: 33.9) mpirun -n 2 --allow-run-as-root \ python3 ${PROJECT_DIR}/summarize.py \ --test_trt_llm \ @@ -49,8 +53,8 @@ python3 ${PROJECT_DIR}/summarize.py \ --tokenizer_dir ${MODEL_DIR} \ --tokenizer_type "llama" \ --engine_dir ${ENGINE_DIR} \ ---target_load_engine_time 43.94 \ ---tensorrt_llm_rouge1_threshold 27.74 \ ---target_tps 14.46 \ +--tensorrt_llm_rouge1_threshold 27.7 \ +--target_load_engine_time ${LOAD_TIME_TARGET} \ +--target_tps ${TPS_TARGET} \ --use_py_session "$@"; check_status -exit ${EXIT_STATUS} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2_build.sh b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2_build.sh index 4919ffa3cefd7cf5c9f3c74a1186a24d00eb3a96..f33a8013a55ebad395b301be290a333aebcf998a 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2_build.sh +++ b/models/nlp/large_language_model/llama2-13b/trtllm/scripts/test_trtllm_llama2_13b_gpu2_build.sh @@ -15,17 +15,19 @@ #!/bin/bash +echo "start run $0" + EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +BUILD_TIME_TARGET=${BUILD_TIME_TARGET:-72} PROJECT_DIR="./" MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-13b-chat"} -OUTPUT_DIR=${OUTPUT_DIR:-"${PROJECT_DIR}/checkpoints/"} - -echo PROJECT_DIR : ${PROJECT_DIR} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} +CHECKPOINT_DIR="${ENGINE_DIR}/checkpoints" export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -37,17 +39,23 @@ check_status() fi } -# best(build engine time: 223.33) is 95% of target -python3 ${PROJECT_DIR}/build.py \ ---log_level ${LOG_LEVEL} \ ---dtype 
${DTYPE} \ + +python3 convert_checkpoint.py \ --model_dir ${MODEL_DIR} \ ---remove_input_padding \ ---use_gpt_attention_plugin float16 --use_gemm_plugin float16 \ ---enable_context_fmha \ ---disable_xqa \ ---world_size 2 \ +--output_dir ${CHECKPOINT_DIR} \ --tp_size 2 \ ---total_build_time_target 235.1 \ ---output_dir ${OUTPUT_DIR} "$@"; check_status -exit ${EXIT_STATUS} +--workers 2 \ +--dtype ${DTYPE} + + +# best(build engine time: 50) is 70% of target +trtllm-build \ +--log_level ${LOG_LEVEL} \ +--max_batch_size ${BS} \ +--checkpoint_dir ${CHECKPOINT_DIR} \ +--remove_input_padding enable \ +--context_fmha enable \ +--workers 2 \ +--total_build_time_target ${BUILD_TIME_TARGET} \ +--output_dir ${ENGINE_DIR} "$@"; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/summarize.py b/models/nlp/large_language_model/llama2-13b/trtllm/summarize.py index 8e9437f506a09aefb4c6e63d2827e8bdea2814e4..8896ab0bfb500807ea8e6d8a2b9568a3afc257f6 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/summarize.py +++ b/models/nlp/large_language_model/llama2-13b/trtllm/summarize.py @@ -26,11 +26,12 @@ import torch from datasets import load_dataset, load_from_disk from transformers import (AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, GenerationConfig) -from utils import DEFAULT_HF_MODEL_DIRS, load_tokenizer, read_model_name +from utils import (DEFAULT_HF_MODEL_DIRS, add_common_args, load_tokenizer, + read_model_name, supports_inflight_batching) import tensorrt_llm import tensorrt_llm.profiler as profiler -from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm._utils import mpi_broadcast, str_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner from tensorrt_llm.tools.ppl import ppl @@ -58,19 +59,26 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = read_model_name(args.engine_dir) + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + model_name, model_version = read_model_name(args.engine_dir) if args.hf_model_dir is None: - args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + logger.warning( + "hf_model_dir is not specified. Try to infer from model_name, but this may be incorrect." + ) + if model_name in DEFAULT_HF_MODEL_DIRS: + args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + else: + args.hf_model_dir = None if args.tokenizer_dir is None: args.tokenizer_dir = args.hf_model_dir - test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 - test_trt_llm = args.test_trt_llm profiler.start('load tokenizer') tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) profiler.stop('load tokenizer') @@ -96,24 +104,34 @@ def main(args): dataset_input_key = 'input' dataset_output_key = 'output' dataset_split = 'validation' # only this split contains reference strings - - + elif args.eval_task == "eval_context_ppl": + dataset_name = "SlimPajama-6B" + dataset_revision = None + dataset_input_key = 'text' + dataset_output_key = 'text' + dataset_split = 'test' + args.output_len = 1 # Only want to compute the ppl of context + args.eval_ppl = True + logger.warning( + f"Run task '{args.eval_task}', setting 'output_len' to 1, and enable 'eval_ppl'." 
+ ) + if args.dataset_dir is not None and isinstance(args.dataset_dir, str): + args.dataset_dir = args.dataset_dir.rstrip('/') + if args.dataset_dir.endswith(dataset_name): + dataset_name = args.dataset_dir + else: + dataset_name = f"{args.dataset_dir}/{dataset_name}" + logger.info(f"prepare datasets....") if os.getenv("TASK_DATA_PATH"): dataset = load_from_disk(os.getenv("TASK_DATA_PATH"))[dataset_split] else: - # dataset = load_dataset(dataset_name, - # dataset_revision, - # cache_dir=args.dataset_path, - # split=dataset_split, - # trust_remote_code=True) - dataset = load_dataset(dataset_name, dataset_revision, - cache_dir=args.dataset_path, + cache_dir=args.dataset_cache_dir, split=dataset_split) - logger.info(f"datasets is ready.") + max_batch_size = args.batch_size # runtime parameters @@ -124,77 +142,35 @@ def main(args): max_attention_window_size = args.max_attention_window_size sink_token_length = args.sink_token_length + if args.end_id: + end_id = args.end_id + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * args.batch_size + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + # random_seed = 5 temperature = args.temperature num_beams = args.num_beams length_penalty = args.length_penalty + early_stopping = args.early_stopping repetition_penalty = args.repetition_penalty presence_penalty = args.presence_penalty frequency_penalty = args.frequency_penalty - if test_trt_llm: - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." 
- ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - rank=runtime_rank, - debug_mode=args.debug_mode) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=max_batch_size, - max_input_len=test_token_num, - max_output_len=output_len, - max_beam_width=num_beams, - max_attention_window_size=max_attention_window_size, - sink_token_length=sink_token_length) - runner = runner_cls.from_dir(**runner_kwargs) - assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ - "PPL evaluation requires engine built with gather_all_token_logits enabled" - - if test_hf: - profiler.start('load HF model') - dtype_alias_mapping = { - 'fp32': 'float32', - 'fp16': 'float16', - 'bf16': 'bfloat16' - } - args.data_type = dtype_alias_mapping.get(args.data_type, args.data_type) - if model_name.startswith('chatglm'): - auto_model_cls = AutoModel - elif model_name.startswith('glm'): - auto_model_cls = AutoModelForSeq2SeqLM - else: - auto_model_cls = AutoModelForCausalLM - model = auto_model_cls.from_pretrained( - args.hf_model_dir, - trust_remote_code=True, - torch_dtype=str_dtype_to_torch(args.data_type), - device_map='auto' if args.hf_device_map_auto else None) - try: - model.to_bettertransformer() - except ValueError as e: - logger.warning( - f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' - ) - if not args.hf_device_map_auto: - model.cuda() - if model_name == 'qwen': - model.generation_config = GenerationConfig.from_pretrained( - args.hf_model_dir, trust_remote_code=True) - profiler.stop('load HF model') - logger.info( - f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' - ) - output_dir = Path(args.output_dir) if args.output_dir else None if output_dir is not None: output_dir.mkdir(exist_ok=True, parents=True) @@ -207,9 +183,21 @@ def main(args): f.write(f'Model path: {args.hf_model_dir}\n') f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + # TODO: Add random_seed flag in gptj + rouge_dir = args.rouge_dir if args.rouge_dir and os.path.exists( + args.rouge_dir) else "rouge" + metric_tensorrt_llm = [evaluate.load(rouge_dir) for _ in range(num_beams)] + metric_hf = [evaluate.load(rouge_dir) for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + ppls_trt_llm = [[] for _ in range(num_beams)] + ppls_hf = [[] for _ in range(num_beams)] + def _prepare_inputs(batch_input_texts, eval_task='summarize', - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(batch_input_texts) append_str = ' TL;DR: ' if eval_task == 'summarize' else '' batch_input_ids = [] @@ -218,12 +206,13 @@ def main(args): curr_text = curr_text.strip().replace(" n't", "n't") # TODO: The below lines are used to be compatible with the original code; may need fix - if model_name.startswith(('chatglm2', 'chatglm3')): + if 'GLM' in model_name and model_version in ('chatglm2', + 'chatglm3'): input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0) input_ids = 
input_ids[:test_token_num] - elif model_name == 'qwen': - from qwen.utils.utils import make_context + elif 'qwen' in model_name.lower() and model_version == 'qwen': + from tensorrt_llm.models.qwen.utils import make_context # use make_content to generate prompt system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user." _, input_id_list = make_context( @@ -235,6 +224,18 @@ def main(args): ) input_ids = torch.tensor(input_id_list) else: + if 'qwen' in model_name.lower() and 'qwen2' in model_version: + messages = [{ + "role": + "system", + "content": + "You are a helpful assistant, please summarize the article entered by the user with one or two sentences." + }, { + "role": "user", + "content": curr_text + }] + curr_text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) input_ids = tokenizer.encode( curr_text, return_tensors='pt', @@ -242,17 +243,23 @@ def main(args): truncation=True, max_length=test_token_num).squeeze(0) - batch_input_ids.append(input_ids) + if input_ids.numel() > min_input_length: + batch_input_ids.append(input_ids) return batch_input_ids def eval_trt_llm(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], {} input_lengths = [x.size(0) for x in batch_input_ids] with torch.no_grad(): @@ -266,11 +273,15 @@ def main(args): temperature=temperature, top_k=top_k, top_p=top_p, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, num_beams=num_beams, length_penalty=length_penalty, + early_stopping=early_stopping, repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + lora_uids=args.lora_task_uids, output_sequence_lengths=True, return_dict=True, medusa_choices=args.medusa_choices) @@ -327,7 +338,8 @@ def main(args): def eval_hf(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) if batch_size > 1: logger.warning( @@ -335,7 +347,11 @@ def main(args): ) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], [[] for _ in range(batch_size)] input_lengths = [x.size(0) for x in batch_input_ids] # Left padding for HF max_length = max(input_lengths) @@ -349,6 +365,12 @@ def main(args): batch_input_ids = torch.stack(batch_input_ids) batch_input_ids = batch_input_ids.cuda() + # specialization for HF + if early_stopping in [0, 1]: + local_early_stopping = bool(early_stopping) + else: + local_early_stopping = "never" + with torch.no_grad(): outputs = model.generate(batch_input_ids, max_new_tokens=output_len, @@ -358,8 +380,8 @@ def main(args): pad_token_id=pad_id, num_beams=num_beams, num_return_sequences=num_beams, - early_stopping=True, length_penalty=length_penalty, + early_stopping=local_early_stopping, output_scores=True, return_dict_in_generate=True) if 
eval_ppl and batch_size == 1: @@ -384,7 +406,14 @@ def main(args): output_ids != pad_id).sum(dim=-1) context_logits = context_outputs['logits'] # Remove the first generation logits which are same to last context logits - generation_logits = torch.stack(outputs['scores'][1:], dim=1) + generation_logits = outputs['scores'][1:] + # When output_len is 1, generation_logits would be () and lead to error if we do torch.stack + if len(generation_logits) == 0: + generation_logits = torch.empty( + [context_logits.shape[0], 0, context_logits.shape[-1]], + device=context_logits.device) + else: + generation_logits = torch.stack(generation_logits, dim=1) _, max_gen_len, voc_size = generation_logits.size() generation_logits = generation_logits.view(batch_size, num_beams, max_gen_len, voc_size) @@ -410,12 +439,58 @@ def main(args): return output_lines_list, tokens_list, ppls if test_trt_llm: + if not supports_inflight_batching(args.engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." + ) + args.use_py_session = True + if args.return_all_generated_tokens: + raise ValueError( + "Returning all the generated tokens at each step is not supported in summarize.py" + ) + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + gpu_weights_percent=args.gpu_weights_percent) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=max_batch_size, + max_input_len=test_token_num, + max_output_len=output_len, + max_beam_width=num_beams, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) + runner = runner_cls.from_dir(**runner_kwargs) + assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + datapoint = dataset[0:1] output, *_ = eval_trt_llm(datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - if runtime_rank == 0: + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": logger.info( "---------------------------------------------------------") logger.info("TensorRT-LLM Generated : ") @@ -424,71 +499,50 @@ def main(args): logger.info(f"\n Output : {output}") logger.info( "---------------------------------------------------------") - if test_hf: - datapoint = dataset[0:1] - output, *_ = eval_hf(datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - logger.info("---------------------------------------------------------") - logger.info("HF Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info("---------------------------------------------------------") - # TODO: Add random_seed flag in gptj - metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] - metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] - for i in range(num_beams): - metric_tensorrt_llm[i].seed = 0 - metric_hf[i].seed = 0 - ppls_trt_llm = [[] for _ in range(num_beams)] - ppls_hf = [[] for _ in range(num_beams)] + ite_count = 0 + data_point_idx = 0 + total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 + + if args.stability_test: + logger.info(f"stability test, need {args.stability_test_hours} hours") + else: + logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") + stability_start_time = time.time() - ite_count = 0 - data_point_idx = 0 - total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 - - if args.stability_test: - logger.info(f"stability test, need {args.stability_test_hours} hours") - else: - logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") - stability_start_time = time.time() - while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): - if runtime_rank == 0: - logger.debug( - f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" - ) - datapoint = dataset[data_point_idx:(data_point_idx + max_batch_size)] + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] - if test_trt_llm: profiler.start('tensorrt_llm') output_tensorrt_llm, output_ids_trt_llm, curr_ppls_trt_llm, lengths_info = eval_trt_llm( datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) profiler.stop('tensorrt_llm') + + empty_batch = (runtime_rank == 0 and len(output_tensorrt_llm) == 0) + 
empty_batch = mpi_broadcast(empty_batch, 0) + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + if runtime_rank == 0: input_lengths = lengths_info['input_lengths'] seq_lengths = lengths_info['seq_lengths'] output_token_count_trt_llm = sum( - seq_lengths[idx][0] - input_lengths[idx] - for idx in range(len(input_lengths))) + seq_lengths[bs][bm] - input_lengths[bs] + for bm in range(len(output_tensorrt_llm[0])) + for bs in range(len(output_tensorrt_llm))) total_output_token_count_trt_llm += output_token_count_trt_llm - if test_hf: - profiler.start('hf') - output_hf, _, curr_ppls_hf = eval_hf( - datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - profiler.stop('hf') - - if runtime_rank == 0: - if test_trt_llm: for batch_idx in range(len(output_tensorrt_llm)): for beam_idx in range(num_beams): metric_tensorrt_llm[beam_idx].add_batch( @@ -502,13 +556,121 @@ def main(args): ppls_trt_llm[beam_idx].append( curr_ppls_trt_llm[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_tensorrt_llm[0])): for beam_idx in range(num_beams): with (output_dir / 'trtllm.out').open('a') as f: - f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n') - # yapf: enable - if test_hf: + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n' + ) + + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") + logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + logger.debug(f"Reference : {datapoint[dataset_output_key]}") + + data_point_idx += max_batch_size + ite_count += 1 + + if args.stability_test: + test_time_hours = round((time.time() - stability_start_time) / 3600, 1) + if test_time_hours > args.stability_test_hours: + if runtime_rank == 0: + logger.info(f"Stability Test Finished. 
Total run {test_time_hours} hours.") + break + else: + data_point_idx = data_point_idx % len(dataset) + ite_count = ite_count % args.max_ite + if runtime_rank == 0 and ite_count % 100 == 0: + logger.info( + f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") + elif runtime_rank == 0 and ite_count % 10 == 0: + logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + + del runner + + if test_hf and runtime_rank == 0: + profiler.start('load HF model') + dtype_alias_mapping = { + 'fp32': 'float32', + 'fp16': 'float16', + 'bf16': 'bfloat16' + } + args.hf_data_type = dtype_alias_mapping.get(args.hf_data_type, + args.hf_data_type) + if 'GLM' in model_name and model_version == 'glm': + auto_model_cls = AutoModelForSeq2SeqLM + elif 'GLM' in model_name and model_version == 'chatglm': + auto_model_cls = AutoModel + else: + auto_model_cls = AutoModelForCausalLM + model = auto_model_cls.from_pretrained( + args.hf_model_dir, + trust_remote_code=True, + torch_dtype=str_dtype_to_torch(args.hf_data_type), + device_map='auto' if args.hf_device_map_auto else None) + try: + model.to_bettertransformer() + except Exception as e: + logger.warning( + f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' + ) + if not args.hf_device_map_auto: + model.cuda() + if model_name == 'qwen': + model.generation_config = GenerationConfig.from_pretrained( + args.hf_model_dir, trust_remote_code=True) + profiler.stop('load HF model') + logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + + datapoint = dataset[0:1] + output, *_ = eval_hf(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": + logger.info( + "---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + logger.info( + "---------------------------------------------------------") + + ite_count = 0 + data_point_idx = 0 + total_output_token_count_hf = 0 # only valid for runtime_rank == 0 + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] + + profiler.start('hf') + output_hf, token_list, curr_ppls_hf = eval_hf( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + profiler.stop('hf') + + # HF model runs on rank 0 only + empty_batch = len(output_hf) == 0 + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + + if runtime_rank == 0: + seq_lengths = [len(tokens) for tokens in token_list] + total_output_token_count_hf += sum(seq_lengths) + for beam_idx in range(num_beams): for batch_idx in range(len(output_hf[beam_idx])): metric_hf[beam_idx].add_batch( @@ -520,37 +682,21 @@ def main(args): ppls_hf[beam_idx].append( curr_ppls_hf[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_hf[0])): for beam_idx in range(num_beams): with (output_dir / 'hf.out').open('a') as f: - 
f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n') - # yapf: enable + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n' + ) - logger.debug('-' * 100) - logger.debug(f"Input : {datapoint[dataset_input_key]}") - if test_trt_llm: - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') - if test_hf: + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") logger.debug(f'HF Output: {output_hf}') - logger.debug(f"Reference : {datapoint[dataset_output_key]}") + logger.debug(f"Reference : {datapoint[dataset_output_key]}") - data_point_idx += max_batch_size - ite_count += 1 - - if args.stability_test: - test_time_hours = round((time.time() - stability_start_time)/3600, 1) - if test_time_hours > args.stability_test_hours: - if runtime_rank == 0: - logger.info(f"Stability Test Finished. Total run {test_time_hours} hours.") - break - else: - data_point_idx = data_point_idx % len(dataset) - ite_count = ite_count % args.max_ite - if runtime_rank == 0 and ite_count % 1000 == 0: - logger.info(f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") - elif runtime_rank == 0 and ite_count % 10 == 0: - logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + data_point_idx += max_batch_size + ite_count += 1 + del model if runtime_rank == 0: if test_trt_llm: @@ -558,6 +704,7 @@ def main(args): logger.info( f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' ) + logger.info( f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' ) @@ -567,30 +714,30 @@ def main(args): rouge1 = 0 tps = total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm") - + for beam_idx in range(num_beams): logger.info(f"TensorRT-LLM beam {beam_idx} result") - computed_metrics_tensorrt_llm = metric_tensorrt_llm[ - beam_idx].compute() - for key in computed_metrics_tensorrt_llm.keys(): - logger.info( - f' {key} : {computed_metrics_tensorrt_llm[key]*100}') - - if args.check_accuracy and beam_idx == 0: - assert computed_metrics_tensorrt_llm[ - 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold - - if beam_idx == 0: - rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 - + if args.eval_task != "eval_context_ppl": + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key]*100}' + ) + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm[ + 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold + + if beam_idx == 0: + rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 if args.eval_ppl: logger.info( f" Per-token perplexity: {np.mean(ppls_trt_llm[beam_idx])}" ) if args.check_accuracy and beam_idx == 0: - assert np.mean(ppls_trt_llm[beam_idx] - ) < args.tensorrt_llm_ppl_threshold - + avg_ppl = np.mean(ppls_trt_llm[beam_idx]) + assert avg_ppl < args.tensorrt_llm_ppl_threshold, f"[FAILED] average PPL ({avg_ppl}) is larger than threshold ({args.tensorrt_llm_ppl_threshold})" + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") logger.info(f'Load engine takes: {load_engine_time} sec') @@ -599,19 +746,27 @@ def main(args): print("successful.") else: print("failed.") - - sys.exit(int(not status)) - + + sys.exit(int(not status)) + if test_hf: np.random.seed(0) # rouge score use sampling to compute the score logger.info( f'Hugging Face (total 
latency: {profiler.elapsed_time_in_sec("hf")} sec)' ) + logger.info( + f'Hugging Face (total output tokens: {total_output_token_count_hf})' + ) + logger.info( + f'Hugging Face (tokens per second: {total_output_token_count_hf / profiler.elapsed_time_in_sec("hf")})' + ) + for beam_idx in range(num_beams): logger.info(f"HF beam {beam_idx} result") computed_metrics_hf = metric_hf[beam_idx].compute() - for key in computed_metrics_hf.keys(): - logger.info(f' {key} : {computed_metrics_hf[key]*100}') + if args.eval_task != "eval_context_ppl": + for key in computed_metrics_hf.keys(): + logger.info(f' {key} : {computed_metrics_hf[key]*100}') if args.eval_ppl and args.batch_size == 1: logger.info( f" Per-token perplexity: {np.mean(ppls_hf[beam_idx])}") @@ -619,34 +774,15 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) - parser.add_argument( - '--tokenizer_dir', - default=None, - help='tokenizer path; defaults to hf_model_dir if left unspecified') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file') parser.add_argument('--test_hf', action='store_true') parser.add_argument('--test_trt_llm', action='store_true') - parser.add_argument( - '--data_type', - type=str, - choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], - default='fp16') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") - parser.add_argument( - '--eval_task', - type=str, - default='summarize', - choices=['summarize', 'summarize_long', 'code_completion']) + parser.add_argument('--eval_task', + type=str, + default='summarize', + choices=[ + 'summarize', 'summarize_long', 'code_completion', + 'eval_context_ppl' + ]) parser.add_argument('--check_accuracy', action='store_true') parser.add_argument('--tensorrt_llm_rouge1_threshold', type=float, @@ -655,51 +791,33 @@ if __name__ == '__main__': parser.add_argument('--tensorrt_llm_ppl_threshold', type=float, default=15.0) + parser.add_argument( + '--dataset_dir', + type=str, + default=None, + help="The local directory of the dataset for evaluation; " + "will download the dataset from huggingface hub if not specified.") + parser.add_argument( + '--dataset_cache_dir', + type=str, + default=None, + help="The local cache directory for dataset; " + "will use `~/.cache/huggingface/datasets` if not specified.") parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_tps', type=float, default=0) - parser.add_argument('--dataset_path', type=str, default='') - parser.add_argument('--log_level', type=str, default='info') parser.add_argument('--batch_size', type=int, default=1) parser.add_argument('--max_ite', type=int, default=20) parser.add_argument('--output_len', type=int, default=100) parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument( - '--max_attention_window_size', + '--min_input_length', type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - 
parser.add_argument('--num_beams', type=int, default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument( - '--hf_device_map_auto', - action='store_true', - help="Use device map 'auto' to load a pretrained HF model. This may " - "help to test a large model that cannot fit into a singlue GPU.") + default=0, + help='skip the sentences which are shorter than min_input_length.') parser.add_argument( '--output_dir', type=str, @@ -708,17 +826,19 @@ if __name__ == '__main__': "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " "save outputs.") parser.add_argument( - '--medusa_choices', - type=str, + '--rouge_dir', default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + type=str, + help= + "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF." ) parser.add_argument('--stability_test', default=False, action='store_true', help="Whether or not to run stability test for tensorrt_llm.") parser.add_argument('--stability_test_hours', type=float, default=24.0) + parser = add_common_args(parser) args = parser.parse_args() print(args) - main(args) + + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/utils.py b/models/nlp/large_language_model/llama2-13b/trtllm/utils.py index 44042d9e2dcb44dd6cd917ab16a00010e4005202..340ea03995dc62d200234e43ec3e73a4d4923bbb 100644 --- a/models/nlp/large_language_model/llama2-13b/trtllm/utils.py +++ b/models/nlp/large_language_model/llama2-13b/trtllm/utils.py @@ -12,55 +12,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import json from pathlib import Path from typing import Optional -from transformers import AutoTokenizer, T5Tokenizer +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -import tensorrt_llm +from tensorrt_llm.bindings import GptJsonConfig +from tensorrt_llm.builder import get_engine_version DEFAULT_HF_MODEL_DIRS = { - 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', - 'bloom': 'bigscience/bloom-560m', - 'chatglm_6b': 'THUDM/chatglm-6b', - 'chatglm2_6b': 'THUDM/chatglm2-6b', - 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', - 'chatglm3_6b': 'THUDM/chatglm3-6b', - 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', - 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', - 'falcon': 'tiiuae/falcon-rw-1b', - 'glm_10b': 'THUDM/glm-10b', - 'gpt': 'gpt2-medium', - 'gptj': 'EleutherAI/gpt-j-6b', - 'gptneox': 'EleutherAI/gpt-neox-20b', - 'internlm': 'internlm/internlm-chat-7b', - 'llama': 'meta-llama/Llama-2-7b-hf', - 'mpt': 'mosaicml/mpt-7b', - 'phi': 'microsoft/phi-2', - 'opt': 'facebook/opt-350m', - 'qwen': 'Qwen/Qwen-7B', + 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BaiChuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BloomForCausalLM': 'bigscience/bloom-560m', + 'GLMModel': 'THUDM/glm-10b', + 'ChatGLMModel': 'THUDM/chatglm3-6b', + 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', + 'RWForCausalLM': 'tiiuae/falcon-rw-1b', + 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', + 'GPT2LMHeadModel': 'gpt2', + 'GPT2LMHeadCustomModel': 'gpt2', + 'Starcoder2ForCausalLM': 'bigcode/starcoder2-3b', + 'GPTForCausalLM': 'gpt2', + 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', + 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', + 'InternLMForCausalLM': 'internlm/internlm-chat-7b', + 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', + 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', + 'MPTForCausalLM': 'mosaicml/mpt-7b', + 'PhiForCausalLM': 'microsoft/phi-2', + 'OPTForCausalLM': 'facebook/opt-350m', + 'QWenLMHeadModel': 'Qwen/Qwen-7B', + 'QWenForCausalLM': 'Qwen/Qwen-7B', + 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', + 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', + 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', } +INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). +- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. +- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
+""" + +QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n" + DEFAULT_PROMPT_TEMPLATES = { - 'internlm': - "<|User|>:{input_text}\n<|Bot|>:", - 'qwen': - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'InternLMForCausalLM': "<|User|>:{input_text}\n<|Bot|>:", + 'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION + + "<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE, + 'QWenForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE, } +def supports_inflight_batching(engine_dir): + config_path = Path(engine_dir) / "config.json" + json_config = GptJsonConfig.parse_file(config_path) + model_config = json_config.model_config + return model_config.supports_inflight_batching + + +def read_decoder_start_token_id(engine_dir): + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + return config['pretrained_config']['decoder_start_token_id'] + + def read_model_name(engine_dir: str): - engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + engine_version = get_engine_version(engine_dir) with open(Path(engine_dir) / "config.json", 'r') as f: config = json.load(f) if engine_version is None: - return config['builder_config']['name'] + return config['builder_config']['name'], None - return config['pretrained_config']['architecture'] + model_arch = config['pretrained_config']['architecture'] + model_version = None + if 'GLM' in model_arch: + model_version = config['pretrained_config']['chatglm_version'] + if 'qwen' in model_arch.lower(): + model_version = config['pretrained_config']['qwen_type'] + return model_arch, model_version def throttle_generator(generator, stream_interval): @@ -74,7 +109,8 @@ def throttle_generator(generator, stream_interval): def load_tokenizer(tokenizer_dir: Optional[str] = None, vocab_file: Optional[str] = None, - model_name: str = 'gpt', + model_name: str = 'GPTForCausalLM', + model_version: Optional[str] = None, tokenizer_type: Optional[str] = None): if vocab_file is None: use_fast = True @@ -86,28 +122,34 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, padding_side='left', truncation_side='left', trust_remote_code=True, - tokenizer_type=tokenizer_type, + # tokenizer_type=tokenizer_type, # adapt to llama3 use_fast=use_fast) + elif model_name == 'GemmaForCausalLM' or model_name == 'RecurrentGemmaForCausalLM': + from transformers import GemmaTokenizer + + # Initialize tokenizer from vocab file. 
+ tokenizer = GemmaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False) + elif model_name == 'Grok1ModelForCausalLM': + tokenizer = LlamaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False, + use_fast=False) else: # For gpt-next, directly load from tokenizer.model - assert model_name == 'gpt' tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', - truncation_side='left') - - if model_name == 'qwen': + truncation_side='left', + legacy=False) + if 'qwen' in model_name.lower() and model_version == 'qwen': with open(Path(tokenizer_dir) / "generation_config.json") as f: gen_config = json.load(f) - chat_format = gen_config['chat_format'] - if chat_format == 'raw': - pad_id = gen_config['pad_token_id'] - end_id = gen_config['eos_token_id'] - elif chat_format == 'chatml': - pad_id = tokenizer.im_end_id - end_id = tokenizer.im_end_id - else: - raise Exception(f"unknown chat format: {chat_format}") - elif model_name == 'glm_10b': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif 'GLM' in model_name and model_version == 'glm': pad_id = tokenizer.pad_token_id end_id = tokenizer.eop_token_id else: @@ -117,3 +159,212 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, end_id = tokenizer.eos_token_id return tokenizer, pad_id, end_id + + +def add_common_args(parser): + # sampling arguments + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams > 1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0) + parser.add_argument('--random_seed', type=int, default=0) + parser.add_argument('--early_stopping', + type=int, + help='Use early stopping if num_beams > 1, ' + '1 for early-stopping, 0 for non-early-stopping, ' + 'other values for stopping by length', + default=1) + parser.add_argument( + '--end_id', + default=None, + type=int, + help="Override tokenizer end_id to stop on given end_id token.") + parser.add_argument( + '--stop_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.' + ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"', + ) + parser.add_argument( + '--bad_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.'
+ ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"', + ) + parser.add_argument('--no_repeat_ngram_size', type=int, default=None) + + # common runtime arguments + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behavior' + ) + parser.add_argument( + '--multi_block_mode', + action='store_true', + help= + "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel." + ) + parser.add_argument('--enable_context_fmha_fp32_acc', + action='store_true', + help="Enable FMHA runner FP32 accumulation.") + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--no_prompt_template', + dest='use_prompt_template', + default=True, + action='store_false', + help= + "Whether or not to use default prompt template to wrap the input text.") + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + parser.add_argument( + '--prompt_table_path', + type=str, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--prompt_tasks', + help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") + parser.add_argument('--lora_dir', + type=str, + default=None, + nargs="+", + help="The directory of LoRA weights") + parser.add_argument('--lora_ckpt_source', + type=str, + default="hf", + choices=["hf", "nemo"], + help="The source of lora checkpoint.") + parser.add_argument( + '--lora_task_uids', + type=str, + default=None, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module") + parser.add_argument( + '--num_prepend_vtokens', + nargs="+", + type=int, + help="Number of (default) virtual tokens to prepend to each sentence." + " For example, '--num_prepend_vtokens=10' will prepend the tokens" + " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + ) + + # model arguments + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file', + help="Used for sentencepiece tokenizers") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) + parser.add_argument( + '--tokenizer_dir', + default=None, + help='tokenizer path; defaults to hf_model_dir if left unspecified') + + # memory argument + parser.add_argument( + '--gpu_weights_percent', + default=1, + type=float, + help= + 'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.', + ) + parser.add_argument( + '--max_tokens_in_paged_kv_cache', + default=None, + type=int, + help= + 'Specify the maximum number of tokens in a kv cache page (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_enable_block_reuse', + action='store_true', + help= + 'Enables block reuse in kv cache (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_free_gpu_memory_fraction', + default=0.9, + type=float, + help='Specify the free gpu memory fraction.', + ) + parser.add_argument( + '--enable_chunked_context', + action='store_true', + help='Enables chunked context (only available with cpp session).', + ) + + # hf model argument (if use hf model) + parser.add_argument( + '--hf_data_type', + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], + default='fp16', + help="The data type for hf model.") + parser.add_argument( + '--hf_device_map_auto', + action='store_true', + help="Use device map 'auto' to load a pretrained HF model. This may " + "help to test a large model that cannot fit into a single GPU.") + + parser.add_argument( + "--return_all_generated_tokens", + default=False, + action="store_true", + help="This option changes the token output only for streaming. " + "If not specified, return only generated tokens at each step. " + "If specified, return the full beams/outputs at each step. " + "It is automatically enabled for num_beams>1 (only available with cpp session). " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + + return parser \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/build.py b/models/nlp/large_language_model/llama2-70b/trtllm/build.py deleted file mode 100644 index 4ff0c9eaa0cedfd382783a5cfcca9175bf38acad..0000000000000000000000000000000000000000 --- a/models/nlp/large_language_model/llama2-70b/trtllm/build.py +++ /dev/null @@ -1,1163 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-import argparse -import json -import math -import os -import sys -import time -from pathlib import Path - -# isort: off -import torch -import torch.multiprocessing as mp -import tensorrt as trt -# isort: on -from transformers import LlamaConfig, LlamaForCausalLM - -try: - from transformers import MixtralForCausalLM -except ImportError: - MixtralForCausalLM = None - -try: - from transformers import LlavaConfig, LlavaForConditionalGeneration -except ImportError: - pass - -import tensorrt_llm -from tensorrt_llm import profiler -from tensorrt_llm._common import check_max_num_tokens -from tensorrt_llm._utils import str_dtype_to_trt -from tensorrt_llm.builder import Builder -from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.layers.attention import PositionEmbeddingType -from tensorrt_llm.logger import logger -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import quantize_model -from tensorrt_llm.network import net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime.lora_manager import LoraConfig - -from tensorrt_llm.models.llama.weight import ( # isort:skip - get_scaling_factors, load_from_awq_llama, load_from_binary, - load_from_gptq_llama, load_from_hf_checkpoint, load_from_hf_llama, - load_from_meta_llama, parse_bin_config) - -MODEL_NAME = "llama" - -# 2 routines: get_engine_name, serialize_engine -# are direct copy from gpt example, TODO: put in utils? - -import onnx -from onnx import TensorProto, helper - - -def trt_dtype_to_onnx(dtype): - if dtype == trt.float16: - return TensorProto.DataType.FLOAT16 - if dtype == trt.bfloat16: - return TensorProto.DataType.BFLOAT16 - elif dtype == trt.float32: - return TensorProto.DataType.FLOAT - elif dtype == trt.int32: - return TensorProto.DataType.INT32 - elif dtype == trt.int64: - return TensorProto.DataType.INT64 - elif dtype == trt.bool: - return TensorProto.DataType.BOOL - else: - raise TypeError("%s is not supported" % dtype) - - -def to_onnx(network, path): - inputs = [] - for i in range(network.num_inputs): - network_input = network.get_input(i) - inputs.append( - helper.make_tensor_value_info( - network_input.name, trt_dtype_to_onnx(network_input.dtype), - list(network_input.shape))) - - outputs = [] - for i in range(network.num_outputs): - network_output = network.get_output(i) - outputs.append( - helper.make_tensor_value_info( - network_output.name, trt_dtype_to_onnx(network_output.dtype), - list(network_output.shape))) - - nodes = [] - for i in range(network.num_layers): - layer = network.get_layer(i) - layer_inputs = [] - for j in range(layer.num_inputs): - ipt = layer.get_input(j) - if ipt is not None: - layer_inputs.append(layer.get_input(j).name) - layer_outputs = [ - layer.get_output(j).name for j in range(layer.num_outputs) - ] - nodes.append( - helper.make_node(str(layer.type), - name=layer.name, - inputs=layer_inputs, - outputs=layer_outputs, - domain="com.nvidia")) - - onnx_model = helper.make_model(helper.make_graph(nodes, - 'attention', - inputs, - outputs, - initializer=None), - producer_name='NVIDIA') - onnx.save(onnx_model, path) - - -def get_engine_name(model, dtype, tp_size, pp_size, rank): - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, - pp_size, rank) - - -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - with open(path, 'wb') as f: - 
f.write(engine) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def parse_arguments(cmd_args=None): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', type=int, default=1) - parser.add_argument('--tp_size', type=int, default=1) - parser.add_argument('--pp_size', type=int, default=1) - parser.add_argument('--model_dir', type=str, default=None) - parser.add_argument('--bin_model_dir', type=str, default=None) - parser.add_argument('--meta_ckpt_dir', type=str, default=None) - parser.add_argument('--quant_ckpt_path', type=str, default=None) - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float32', 'bfloat16', 'float16']) - parser.add_argument( - '--timing_cache', - type=str, - default='model.cache', - help= - 'The path of to read timing cache from, will be ignored if the file does not exist' - ) - parser.add_argument( - '--profiling_verbosity', - type=str, - default='layer_names_only', - choices=['layer_names_only', 'detailed', 'none'], - help= - 'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.' - ) - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument('--vocab_size', type=int, default=32000) - parser.add_argument('--n_layer', type=int, default=32) - parser.add_argument('--n_positions', type=int, default=2048) - parser.add_argument('--n_embd', type=int, default=4096) - parser.add_argument('--n_head', type=int, default=32) - parser.add_argument('--n_kv_head', type=int, default=None) - parser.add_argument('--multiple_of', type=int, default=256) - parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) - parser.add_argument('--inter_size', type=int, default=None) - parser.add_argument('--hidden_act', type=str, default='silu') - parser.add_argument('--rms_norm_eps', type=float, default=1e-06) - parser.add_argument('--max_batch_size', type=int, default=8) - parser.add_argument('--max_input_len', type=int, default=2048) - parser.add_argument('--max_output_len', type=int, default=512) - parser.add_argument('--max_beam_width', type=int, default=1) - parser.add_argument('--rotary_base', type=float, default=10000.0) - parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) - parser.add_argument('--use_gpt_attention_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gemm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_rmsnorm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_lookup_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gather_last_token_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_activation_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_elementwise_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument("--use_cast_plugin", action="store_true") - - 
parser.add_argument('--parallel_build', default=False, action='store_true') - parser.add_argument('--enable_context_fmha', - default=False, - action='store_true') - parser.add_argument('--enable_context_fmha_fp32_acc', - default=False, - action='store_true') - parser.add_argument( - '--use_paged_context_fmha', - action='store_true', - help= - 'Activates paged context FMHA. This mode of the context FMHA is required for chunked context, speculative decoding and reuse of KV cache blocks. Context FMHA performance is worse when this mode is on.' - ) - parser.add_argument( - '--multi_block_mode', - default=False, - action='store_true', - help= - 'Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ - It is beneficial when batch x num_heads cannot fully utilize GPU.' - ) - parser.add_argument( - '--disable_xqa', - default=False, - action='store_true', - help= - 'Disable XQA optimization for the generation MHA. See more details in docs/gpt_attention.' - ) - parser.add_argument('--visualize', default=False, action='store_true') - parser.add_argument('--load_by_shard', - action='store_true', - help='Load a pretrained model shard-by-shard.') - parser.add_argument('--enable_debug_output', - default=False, - action='store_true') - parser.add_argument('--gpus_per_node', type=int, default=8) - parser.add_argument('--builder_opt', type=int, default=None) - parser.add_argument( - '--output_dir', - type=str, - default='engine_outputs', - help= - 'The path to save the serialized engine files, timing cache file and model configs' - ) - parser.add_argument('--remove_input_padding', - default=False, - action='store_true') - parser.add_argument( - '--use_fused_mlp', - default=False, - action='store_true', - help= - 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' - 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded ' - '(0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).' - ) - parser.add_argument('--enable_pos_shift', - default=False, - action='store_true', - help='Enable position shift for streamingllm method') - parser.add_argument( - '--dense_context_fmha', - default=False, - action='store_true', - help= - 'Enable dense fmha in context phase, otherwise sliding window attention.' - 'If dense_context_fmha=False, the sliding window size is the max attention window size.' - ) - # Arguments related to the quantization of the model. - parser.add_argument( - '--use_smooth_quant', - default=False, - action="store_true", - help= - 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' - 'See --per_channel and --per_token for finer-grained quantization options.' - ) - parser.add_argument( - '--per_channel', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor for the GEMM\'s result. ' - 'per_channel instead uses a different static scaling factor for each channel. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_token', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale activations in the int8 range. ' - 'per_token chooses at run time, and for each token, a custom scaling factor. 
' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_group', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale weights in the int4 range. ' - 'per_group chooses at run time, and for each group, a custom scaling factor. ' - 'The flag is built for GPTQ/AWQ quantization.') - parser.add_argument('--group_size', - type=int, - default=128, - help='Group size used in GPTQ/AWQ quantization.') - parser.add_argument( - '--int8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--use_parallel_embedding', - action="store_true", - default=False, - help= - 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' - ) - parser.add_argument( - '--embedding_sharding_dim', - type=int, - default=1, # Meta does TP on hidden dim - choices=[0, 1], - help= - 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' - 'To shard it along hidden dimension, set embedding_sharding_dim=1' - 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' - ) - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--quantized_fp8_model_path', - type=str, - default=None, - help='Path of a quantized model checkpoint in .npz format') - parser.add_argument( - '--use_weight_only', - default=False, - action="store_true", - help='Quantize weights for the various GEMMs to INT4/INT8.' - 'See --weight_only_precision to set the precision') - parser.add_argument( - '--disable_weight_only_quant_plugin', - default=False, - action="store_true", - help= - 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--weight_only_precision', - const='int8', - type=str, - nargs='?', - default='int8', - choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], - help= - 'Define the precision for the weights when using weight-only quantization.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--quantize_lm_head', - default=False, - action="store_true", - help='Quantize lm_head weights as well when using int4_awq.') - parser.add_argument( - '--use_inflight_batching', - action="store_true", - default=False, - help="Activates inflight batching mode of gptAttentionPlugin.") - parser.add_argument( - '--paged_kv_cache', - action="store_true", - default=False, - help= - 'By default we use contiguous KV cache. 
By setting this flag you enable paged KV cache' - ) - parser.add_argument('--tokens_per_block', - type=int, - default=128, - help='Number of tokens per block in paged KV cache') - parser.add_argument( - '--max_num_tokens', - type=int, - default=None, - help= - 'Define the max number of tokens supported by the engine, note that it takes no effect if --remove_input_padding is not set' - ) - parser.add_argument( - '--strongly_typed', - default=False, - action="store_true", - help= - 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' - ) - parser.add_argument( - '--use_custom_all_reduce', - action='store_true', - help= - 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') - parser.add_argument( - '--max_prompt_embedding_table_size', - type=int, - default=0, - help='Setting to a value > 0 enables support for prompt tuning.') - parser.add_argument( - '--gather_all_token_logits', - action='store_true', - default=False, - help='Enable both gather_context_logits and gather_generation_logits') - parser.add_argument('--gather_context_logits', - action='store_true', - default=False, - help='Gather context logits') - parser.add_argument('--gather_generation_logits', - action='store_true', - default=False, - help='Gather generation logits') - parser.add_argument( - '--use_lora_plugin', - nargs='?', - const=None, - default=False, - choices=['float16', 'float32', 'bfloat16'], - help="Activates the lora plugin which enables embedding sharing.") - parser.add_argument( - '--lora_target_modules', - nargs='+', - default=None, - choices=[ - "attn_qkv", - "attn_q", - "attn_k", - "attn_v", - "attn_dense", - "mlp_h_to_4h", - "mlp_gate", - "mlp_4h_to_h", - ], - help= - "Add lora in which modules. Only be activated when use_lora_plugin is enabled." - ) - parser.add_argument('--hf_lora_dir', type=str, default=None) - parser.add_argument( - '--max_lora_rank', - type=int, - default=64, - help='maximum lora rank for different lora modules. ' - 'It is used to compute the workspace size of lora plugin.') - parser.add_argument( - '--moe_num_experts', - default=0, - type=int, - help='Specify the number of experts to use for MOE layers') - parser.add_argument( - '--moe_top_k', - default=0, - type=int, - help= - 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' - ) - parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, - type=int, - help= - 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', - ) - parser.add_argument( - '--moe_renorm_mode', - default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, - type=int, - help= - 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', - ) - parser.add_argument("--total_build_time_target", type=float, default=0) - - args = parser.parse_args(cmd_args) - logger.set_level(args.log_level) - - assert args.total_build_time_target >= 0, "total_build_time_target must bigger than 0" - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." 
- - if not args.remove_input_padding: - if args.use_gpt_attention_plugin: - logger.warning( - f"It is recommended to specify --remove_input_padding when using GPT attention plugin" - ) - - if args.use_inflight_batching: - if not args.use_gpt_attention_plugin: - args.use_gpt_attention_plugin = 'float16' - logger.info( - f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" - ) - if not args.remove_input_padding: - args.remove_input_padding = True - logger.info( - "Using remove input padding for inflight batching mode.") - if not args.paged_kv_cache: - args.paged_kv_cache = True - logger.info("Using paged KV cache for inflight batching mode.") - - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, - args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=args.per_group, - use_int4_weights="int4" in args.weight_only_precision) - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - elif args.fp8_kv_cache: - args.quant_mode = args.quant_mode.set_fp8_kv_cache() - if args.enable_fp8: - args.quant_mode = args.quant_mode.set_fp8_qdq() - - if args.rotary_scaling is not None: - assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." - rotary_scaling = { - "type": args.rotary_scaling[0], - "factor": float(args.rotary_scaling[1]) - } - assert rotary_scaling["type"] in ["linear", "dynamic"] - assert rotary_scaling["factor"] > 1.0 - args.rotary_scaling = rotary_scaling - - if args.model_dir is not None: - hf_config = LlamaConfig.from_pretrained(args.model_dir) - if hf_config.model_type == "llava": - # LLaVA = Vision model + Llama LLM - # We load a llava config and use its' text config as llama config - hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config - hf_config.model_type = "llava" # Replace llama with llava - - args.inter_size = hf_config.intermediate_size # override the inter_size for LLaMA - args.n_embd = hf_config.hidden_size - args.n_head = hf_config.num_attention_heads - if hasattr(hf_config, "num_key_value_heads"): - args.n_kv_head = hf_config.num_key_value_heads - - # hf_config.num_hidden_layers = 1 # only for debug - args.n_layer = hf_config.num_hidden_layers - args.n_positions = hf_config.max_position_embeddings - args.vocab_size = hf_config.vocab_size if hf_config.vocab_size is not None else args.vocab_size - args.hidden_act = hf_config.hidden_act - args.rms_norm_eps = hf_config.rms_norm_eps - # These attributes only exists with Mixtral, for the moment - args.moe_num_experts = getattr(hf_config, "num_local_experts", - args.moe_num_experts) - args.moe_top_k = getattr(hf_config, "num_experts_per_tok", - args.moe_top_k) - args.rotary_base = getattr(hf_config, "rope_theta", args.rotary_base) - args.model_type = hf_config.model_type - if hf_config.model_type == "mixtral": - # HF LLaMA-type models are implicitly using gated activation. 
- # With our MoE implementation, we must make it explicit - args.hidden_act = "swiglu" - - elif args.meta_ckpt_dir is not None: - with open(Path(args.meta_ckpt_dir, "params.json")) as fp: - meta_config: dict = json.load(fp) - args.n_embd = meta_config["dim"] - args.n_head = meta_config["n_heads"] - args.n_layer = meta_config["n_layers"] - args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) - if "hidden_dim" in meta_config: - args.inter_size = meta_config["hidden_dim"] - else: - args.multiple_of = meta_config.get("multiple_of", 1) - n_embd = int(4 * args.n_embd * 2 / 3) - args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) - // args.multiple_of) - args.rms_norm_eps = meta_config["norm_eps"] - args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) - args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", - 0) - elif args.bin_model_dir is not None: - n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( - Path(args.bin_model_dir) / "config.ini") - args.inter_size = inter_size # override the inter_size for LLaMA - args.n_kv_head = n_kv_head - args.n_embd = n_embd - args.n_head = n_head - args.n_layer = n_layer - args.n_positions = n_positions - args.vocab_size = vocab_size if args.vocab_size is None else args.vocab_size - args.hidden_act = hidden_act - args.rms_norm_eps = 1e-06 - logger.warning("Set rms_norm_eps to 1e-06 directly.") - if args.n_kv_head is None: - args.n_kv_head = args.n_head - elif args.n_kv_head != args.n_head: - assert (args.n_head % args.n_kv_head) == 0, \ - "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." - assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ - "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ - "the tensor parallelism size to be divisible by the number of K/V heads." 
- - hf_modules_to_trtllm_modules = { - "q_proj": "attn_q", - "k_proj": "attn_k", - "v_proj": "attn_v", - "o_proj": "attn_dense", - "gate_proj": "mlp_h_to_4h", - "down_proj": "mlp_4h_to_h", - "up_proj": "mlp_gate" - } # lora modules on llama - - trtllm_modules_to_hf_modules = { - "attn_q": "q_proj", - "attn_k": "k_proj", - "attn_v": "v_proj", - "attn_dense": "o_proj", - "mlp_h_to_4h": "gate_proj", - "mlp_4h_to_h": "down_proj", - "mlp_gate": "up_proj", - } - - lora_config = LoraConfig.from_hf(args.hf_lora_dir, - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules) - - if lora_config.is_valid: - if args.lora_target_modules is None: - args.lora_target_modules = lora_config.lora_target_modules - # the lora checkpoint might finetune the embedding - if lora_config.vocab_size != 0: - args.vocab_size = lora_config.vocab_size - - args.lora_config = lora_config - - if args.weight_only_precision == 'int4_awq': - inter_alignment = args.tp_size * 128 - if args.inter_size % inter_alignment != 0: - args.inter_size = int((args.inter_size + inter_alignment - 1) / - inter_alignment) * inter_alignment - logger.info("To use awq we pad intermediate_size to {}.".format( - args.inter_size)) - - if args.quantize_lm_head: - vocab_alignment = args.tp_size * 64 - if args.vocab_size % vocab_alignment != 0: - args.vocab_size = int((args.vocab_size + vocab_alignment - 1) / - vocab_alignment) * vocab_alignment - logger.info("To use awq we pad vocab_size to {}.".format( - args.vocab_size)) - - assert args.pp_size * args.tp_size == args.world_size - - args.max_num_tokens = check_max_num_tokens( - max_num_tokens=args.max_num_tokens, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - remove_input_padding=args.remove_input_padding) - - assert (math.log2(args.tokens_per_block).is_integer() - ), "tokens_per_block must be power of 2" - if args.enable_context_fmha or args.enable_context_fmha_fp32_acc: - assert (args.tokens_per_block >= - 128), "Context fMHA requires >= 128 tokens per block" - - if args.inter_size is None: - # this should not be need when loading a real model - # but it is helpful when creating a dummy model without loading any real weights - n_embd = int(4 * args.n_embd * 2 / 3) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // - args.multiple_of) - logger.info(f"Setting inter_size to {args.inter_size}.") - - if args.enable_pos_shift: - assert args.use_gpt_attention_plugin, "Position shift is only support in the gpt attention plugin." 
- assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc - - if args.moe_num_experts and args.moe_top_k == 0: - args.moe_top_k = 1 - args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, - args.moe_tp_mode, - args.moe_renorm_mode).validate() - - if args.gather_all_token_logits: - args.gather_context_logits = True - args.gather_generation_logits = True - - return args - - -def get_model_object(args, mapping, trt_dtype=None): - if trt_dtype is None: - trt_dtype = str_dtype_to_trt(args.dtype) - # Initialize Module - logger.debug("[Python]llama exampels, Initialize tensorrt_llm.models.LLaMAForCausalLM....") - tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM( - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - dtype=trt_dtype, - mlp_hidden_size=args.inter_size, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - mapping=mapping, - rotary_base=args.rotary_base, - rotary_scaling=args.rotary_scaling, - use_parallel_embedding=args.use_parallel_embedding, - embedding_sharding_dim=args.embedding_sharding_dim, - quant_mode=args.quant_mode, - rms_norm_eps=args.rms_norm_eps, - use_fused_mlp=args.use_fused_mlp, - use_prompt_tuning=args.max_prompt_embedding_table_size > 0, - enable_pos_shift=args.enable_pos_shift, - dense_context_fmha=args.dense_context_fmha, - moe_config=args.moe_config, - max_lora_rank=args.max_lora_rank) - quantize_kwargs = {} - if args.use_smooth_quant or args.use_weight_only: - if args.weight_only_precision == 'int4_awq': - exclude_modules = ['lm_head'] if not args.quantize_lm_head else [] - quantize_kwargs = { - "group_size": args.group_size, - "zero": False, - "pre_quant_scale": True, - "exclude_modules": exclude_modules, - } - elif args.weight_only_precision == 'int4_gptq': - quantize_kwargs = { - "group_size": args.group_size, - "zero": True, - "pre_quant_scale": False, - } - elif args.enable_fp8 or args.fp8_kv_cache: - logger.info(f'Loading scaling factors from ' - f'{args.quantized_fp8_model_path}') - quant_scales = get_scaling_factors(args.quantized_fp8_model_path, - num_layers=args.n_layer, - quant_mode=args.quant_mode) - quantize_kwargs = {"quant_scales": quant_scales} - - if args.use_weight_only and args.moe_config.has_moe(): - if 'exclude_modules' in quantize_kwargs: - quantize_kwargs['exclude_modules'].append('router') - else: - quantize_kwargs['exclude_modules'] = ['lm_head', 'router'] - - tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, - **quantize_kwargs) - if args.per_group: - if args.weight_only_precision == 'int4_awq': - load_from_awq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - quantize_lm_head=args.quantize_lm_head, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - else: - load_from_gptq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - elif args.meta_ckpt_dir is not None: - load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, - args.dtype) - elif args.model_dir is not None: - logger.info(f'Loading HF LLaMA ... 
from {args.model_dir}') - tik = time.time() - if not args.load_by_shard: - if args.model_type == "llava": - hf_llava = LlavaForConditionalGeneration.from_pretrained( - args.model_dir, torch_dtype="auto") - hf_llama = hf_llava.language_model - else: - hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM - hf_llama = hf_model.from_pretrained( - args.model_dir, - device_map={ - "model": "cpu", - "lm_head": "cpu", - "embed_tokens": "cpu", - "layers": "cpu", - "norm": "cpu", - }, # Load to CPU memory - torch_dtype='auto', - ) - use_gemm_woq_plugin = not args.disable_weight_only_quant_plugin - # hf_llama.config.num_hidden_layers = 1 # only for debug - load_from_hf_llama(tensorrt_llm_llama, - hf_llama, - mapping=mapping, - dtype=args.dtype, - use_gemm_woq_plugin=use_gemm_woq_plugin, - lora_config=args.lora_config) - del hf_llama - else: - load_from_hf_checkpoint(tensorrt_llm_llama, - args.model_dir, - mapping, - dtype=args.dtype, - lora_config=args.lora_config) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'HF LLaMA loaded. Total time: {t}') - - elif args.bin_model_dir is not None: - load_from_binary(tensorrt_llm_llama, - args.bin_model_dir, - mapping, - fp16=(args.dtype == 'float16'), - multi_query_mode=(args.n_kv_head != args.n_head)) - - return tensorrt_llm_llama - - -def update_plugin_configs(args, network): - if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin( - dtype=args.use_gpt_attention_plugin) - if args.use_gemm_plugin: - if not args.enable_fp8: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - else: - logger.info( - "Gemm plugin does not support FP8. Disabled Gemm plugin.") - if args.use_rmsnorm_plugin: - network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) - if args.use_lora_plugin: - network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) - if args.use_lookup_plugin: - network.plugin_config.set_lookup_plugin(dtype=args.use_lookup_plugin) - if args.use_gather_last_token_plugin: - network.plugin_config.set_gather_last_token_plugin(dtype=args.use_gather_last_token_plugin) - if args.use_activation_plugin: - network.plugin_config.set_activation_plugin(dtype=args.use_activation_plugin) - if args.use_elementwise_plugin: - network.plugin_config.set_elementwise_plugin(dtype=args.use_elementwise_plugin) - if args.use_cast_plugin: - network.plugin_config.set_cast_plugin() - - # Quantization plugins. 
- if args.use_smooth_quant: - network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) - network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) - network.plugin_config.set_quantize_tensor_plugin() - network.plugin_config.set_quantize_per_token_plugin() - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha( - ContextFMHAType.enabled_with_fp32_acc) - if args.multi_block_mode: - network.plugin_config.enable_mmha_multi_block_mode() - if not args.disable_xqa: - network.plugin_config.enable_xqa_optimization() - - if args.use_weight_only and not args.disable_weight_only_quant_plugin: - if args.per_group: - network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( - dtype=args.dtype) - else: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.dtype) - if args.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype, - args.use_custom_all_reduce) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) - return - - -def build_rank_engine(builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, rank, args): - ''' - @brief: Build the engine on the given rank. - @param rank: The rank to build the engine. - @param args: The cmd line arguments. - @return: The built engine. - ''' - dtype = str_dtype_to_trt(args.dtype) - mapping = Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.tp_size, - pp_size=args.pp_size) - - assert args.n_layer % args.pp_size == 0, \ - f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} Engine build starts') - # Initialize Module - tensorrt_llm_llama = get_model_object(args, - mapping=mapping, - trt_dtype=dtype) - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} model weight loaded.') - - # Module -> Network - logger.debug("[Python]llama exampels, convert module to network....") - network = builder.create_network() - network.trt_network.name = engine_name - update_plugin_configs(args, network) - - if args.use_paged_context_fmha: - assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc, "context fmha must be enabled" - network.plugin_config.set_paged_context_fmha() - - logger.debug(f"[Python]llama exampels, network.plugin_config: \n{network.plugin_config}") - with net_guard(network): - # Prepare - network.set_named_parameters(tensorrt_llm_llama.named_parameters()) - - # Forward - inputs = tensorrt_llm_llama.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_seq_len=args.max_input_len + args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - max_num_tokens=args.max_num_tokens, - prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules) - logger.info(f"[Python]llama exampels, forward....\n") - tensorrt_llm_llama(*inputs) - logger.info(f"[Python]llama exampels, forward finished\n") - if args.enable_debug_output: - # mark intermediate nodes' 
outputs - for k, v in tensorrt_llm_llama.named_network_outputs(): - logger.debug(f"enable_debug_output, debug tensor name: {k}") - v = v.trt_tensor - v.name = k - network.trt_network.mark_output(v) - v.dtype = dtype - if args.visualize: - model_path = os.path.join(args.output_dir, 'test.onnx') - to_onnx(network.trt_network, model_path) - - logger.debug("[Python]llama examples, tensorrt_llm.graph_rewriting.optimize....") - tensorrt_llm.graph_rewriting.optimize(network) - - engine = None - - # Network -> Engine - logger.debug("[Python]llama examples, builder.build_engine....") - engine = builder.build_engine(network, builder_config) - if rank == 0: - config_path = os.path.join(args.output_dir, 'config.json') - builder.save_config(builder_config, config_path) - - return engine - - -def get_builder_config_namespace(args, cache): - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). - int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - config = argparse.Namespace( - name=MODEL_NAME, - precision=args.dtype, - timing_cache=args.timing_cache if cache is None else cache, - profiling_verbosity=args.profiling_verbosity, - tensor_parallel=args.tp_size, - pipeline_parallel=args.pp_size, - parallel_build=args.parallel_build, - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - max_batch_size=args.max_batch_size, - max_beam_width=args.max_beam_width, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_num_tokens=args.max_num_tokens, - int8=int8_trt_flag, - quant_mode=args.quant_mode, - strongly_typed=args.strongly_typed, - opt_level=args.builder_opt, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules, - mlp_hidden_size=args.inter_size, - hf_modules_to_trtllm_modules=args.lora_config. - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules=args.lora_config. - trtllm_modules_to_hf_modules, - ) - return config - - -def build(rank, args): - torch.cuda.set_device(rank % args.gpus_per_node) - logger.set_level(args.log_level) - os.makedirs(args.output_dir, exist_ok=True) - - # when doing serializing build, all ranks share one engine - builder = Builder() - cache = None - for cur_rank in range(args.world_size): - # skip other ranks if parallel_build is enabled - if args.parallel_build and cur_rank != rank: - continue - tik = time.time() - - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). 
- int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - builder_config = builder.create_builder_config( - **vars(get_builder_config_namespace(args, cache))) - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, - args.pp_size, cur_rank) - logger.debug("[Python]llama example, build_rank_engine....") - engine = build_rank_engine(builder, builder_config, engine_name, - cur_rank, args) - assert engine is not None, f'Failed to build engine for rank {cur_rank}' - - local_num_kv_heads = (args.n_kv_head + args.world_size - - 1) // args.world_size - kv_dtype = str_dtype_to_trt(args.dtype) - if args.quant_mode.has_int8_kv_cache(): - kv_dtype = str_dtype_to_trt('int8') - elif args.quant_mode.has_fp8_kv_cache(): - kv_dtype = str_dtype_to_trt('fp8') - - # FIXME (Not Support libnvidia-ml.so) - # profiler.check_gpt_mem_usage( - # engine=engine, - # kv_dtype=kv_dtype, - # use_gpt_attention_plugin=args.use_gpt_attention_plugin, - # paged_kv_cache=args.paged_kv_cache, - # max_batch_size=args.max_batch_size, - # max_beam_width=args.max_beam_width, - # max_seq_len=args.max_input_len + args.max_output_len, - # local_num_kv_heads=local_num_kv_heads, - # head_size=args.n_embd / args.n_head, - # num_layers=args.n_layer) - - if cur_rank == 0: - # Use in-memory timing cache for multiple builder passes. - if not args.parallel_build: - cache = builder_config.trt_builder_config.get_timing_cache() - - serialize_engine(engine, os.path.join(args.output_dir, engine_name)) - del engine - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {cur_rank} Engine serialized') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info( - f'Rank {cur_rank} Engine build time: {t} - {tok - tik} (sec)') - - if rank == 0: - ok = builder.save_timing_cache( - builder_config, os.path.join(args.output_dir, "model.cache")) - assert ok, "Failed to save timing cache." - - -if __name__ == '__main__': - args = parse_arguments() - print(args) - tik = time.time() - if args.parallel_build and args.world_size > 1 and \ - torch.cuda.device_count() >= args.world_size: - logger.warning( - f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' - ) - mp.spawn(build, nprocs=args.world_size, args=(args, )) - else: - args.parallel_build = False - logger.info('Serially build TensorRT engines.') - build(0, args) - - tok = time.time() - build_engine_time = tok - tik - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of building all {args.world_size} engines: {t}') - - if args.total_build_time_target != 0: - status = build_engine_time <= args.total_build_time_target - if status: - print("successful.") - else: - print(f"Build engine time check failed! 
Target: {args.total_build_time_target}, Actual: {build_engine_time}") - sys.exit(int(not status)) diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/convert_checkpoint.py b/models/nlp/large_language_model/llama2-70b/trtllm/convert_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..6c44e840456923d9e77b43c1a509e77658d175ae --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/convert_checkpoint.py @@ -0,0 +1,500 @@ +import argparse +import json +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed + +from transformers import AutoConfig + +import tensorrt_llm +from tensorrt_llm._utils import release_gc +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import LLaMAForCausalLM +from tensorrt_llm.models.convert_utils import has_safetensors +from tensorrt_llm.models.llama.convert import load_hf_llama +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' + ) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--inter_size', type=int, default=11008) + parser.add_argument('--multiple_of', type=int, default=None) + parser.add_argument('--ffn_dim_multiplier', type=float, default=None) + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+    )
+    parser.add_argument(
+        '--calib_dataset',
+        type=str,
+        default='ccdv/cnn_dailymail',
+        help=
+        "The huggingface dataset name or the local directory of the dataset for calibration."
+    )
+    parser.add_argument(
+        "--smoothquant",
+        "-sq",
+        type=float,
+        default=None,
+        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
+        " to SmoothQuant the model, and output int8 weights."
+        " A good first try is 0.5. Must be in [0, 1]")
+    parser.add_argument(
+        '--per_channel',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor for the GEMM\'s result. '
+        'per_channel instead uses a different static scaling factor for each channel. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--per_token',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor to scale activations in the int8 range. '
+        'per_token chooses at run time, and for each token, a custom scaling factor. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--int8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
+    )
+    parser.add_argument(
+        '--fp8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV'
+    )
+    parser.add_argument(
+        '--quant_ckpt_path',
+        type=str,
+        default=None,
+        help='Path of a quantized model checkpoint in .safetensors format')
+    parser.add_argument("--use_fp8_rowwise",
+                        action="store_true",
+                        default=False,
+                        help="Enable FP8 per-token per-channel quantization")
+
+    parser.add_argument(
+        '--per_group',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use a single static scaling factor to scale weights in the int4 range. '
+        'per_group chooses at run time, and for each group, a custom scaling factor. '
+        'The flag is built for GPTQ/AWQ quantization.')
+
+    parser.add_argument('--load_by_shard',
+                        action='store_true',
+                        help='Load a pretrained model shard-by-shard.')
+    parser.add_argument('--hidden_act', type=str, default='silu')
+
+    parser.add_argument('--rotary_base', type=float, default=10000.0)
+
+    parser.add_argument('--group_size',
+                        type=int,
+                        default=128,
+                        help='Group size used in GPTQ quantization.'
+                        )  # AWQ is only supported by quantize.py script
+
+    parser.add_argument("--load_model_on_cpu", action="store_true")
+    parser.add_argument(
+        '--use_parallel_embedding',
+        action="store_true",
+        default=False,
+        help=
+        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
+    )
+    parser.add_argument(
+        '--embedding_sharding_dim',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=
+        'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). '
+        'To shard it along hidden dimension, set embedding_sharding_dim=1. '
+        'Note: embedding sharing is only enabled when embedding_sharding_dim = 0'
+    )
+    parser.add_argument(
+        '--use_embedding_sharing',
+        action="store_true",
+        default=False,
+        help=
+        'Try to reduce the engine size by sharing the embedding lookup table between two layers.'
+        'Note: the flag might not take effect when the criteria are not met.')
+    parser.add_argument('--output_dir',
+                        type=str,
+                        default='tllm_checkpoint',
+                        help='The path to save the TensorRT-LLM checkpoint')
+    parser.add_argument(
+        '--workers',
+        type=int,
+        default=1,
+        help='The number of workers for converting checkpoint in parallel')
+    parser.add_argument(
+        '--moe_num_experts',
+        default=0,
+        type=int,
+        help='Specify the number of experts to use for MOE layers')
+    parser.add_argument(
+        '--moe_top_k',
+        default=0,
+        type=int,
+        help=
+        'Specify the top_k value to use for MOE layers. Defaults to 1 if --moe_num_experts is set'
+    )
+    parser.add_argument(
+        '--moe_renorm_mode',
+        default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE,
+        type=int,
+        help=
+        'Controls renormalization after gate logits. Check layers/moe.py for accepted values',
+    )
+    parser.add_argument(
+        '--save_config_only',
+        action="store_true",
+        default=False,
+        help=
+        'Only save the model config without reading and converting the weights; be careful, this is for debugging only'
+    )
+    parser.add_argument(
+        '--remove_duplicated_kv_heads',
+        action="store_true",
+        default=False,
+        help=
+        'Only used to remove the duplicated kv heads of llama-3.1 405B HF model.'
+    )
+    parser.add_argument('--log_level', type=str, default='info')
+
+    args = parser.parse_args()
+    # change the default to be consistent with what the CLI help says.
+    if args.moe_num_experts and args.moe_top_k == 0:
+        args.moe_top_k = 1
+    return args
+
+
+def args_to_quant_config(args: argparse.Namespace) -> QuantConfig:
+    '''Return a QuantConfig with quantization info based on the command line args
+    '''
+    quant_config = QuantConfig()
+    if args.use_weight_only:
+        if args.weight_only_precision == 'int8':
+            quant_config.quant_algo = QuantAlgo.W8A16
+        elif args.weight_only_precision == 'int4':
+            quant_config.quant_algo = QuantAlgo.W4A16
+    elif args.smoothquant:
+        quant_config.smoothquant_val = args.smoothquant
+        if args.per_channel:
+            if args.per_token:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
+            else:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN
+        else:
+            if args.per_token:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN
+            else:
+                quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN
+    elif args.use_fp8_rowwise:
+        quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
+        # this will be overwritten if specified in the hf config.
+        quant_config.clamp_val = [-1200.0, 1200.0]
+
+    if args.int8_kv_cache:
+        quant_config.kv_cache_quant_algo = QuantAlgo.INT8
+
+    if args.fp8_kv_cache:
+        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+    if args.weight_only_precision == 'int4_gptq':
+        quant_config.group_size = args.group_size
+        quant_config.has_zero_point = True
+        quant_config.pre_quant_scale = False
+        quant_config.quant_algo = QuantAlgo.W4A16_GPTQ
+
+    return quant_config
+
+
+def update_quant_config_from_hf(quant_config, hf_config) -> QuantConfig:
+    hf_config_dict = hf_config.to_dict()
+    if hf_config_dict.get('quantization_config'):
+        # update the quant_algo, and clamp_val.
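+        # For fbgemm_fp8 checkpoints, the branch below mirrors the model's
+        # activation_scale_ub into a symmetric clamp_val range.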
+        if hf_config_dict['quantization_config'].get(
+                'quant_method') == 'fbgemm_fp8':
+            logger.info(
+                "Load quantization configs from huggingface model_config.")
+            quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN
+            activation_scale_ub = hf_config_dict['quantization_config'].get(
+                'activation_scale_ub', 1200.0)
+            quant_config.clamp_val = [-activation_scale_ub, activation_scale_ub]
+    return quant_config
+
+
+def convert_and_save_meta(args, rank):
+    mapping = Mapping(world_size=args.tp_size * args.pp_size,
+                      tp_size=args.tp_size,
+                      pp_size=args.pp_size,
+                      moe_tp_size=args.moe_tp_size,
+                      moe_ep_size=args.moe_ep_size,
+                      rank=rank)
+    llama = LLaMAForCausalLM.from_meta_ckpt(
+        args.meta_ckpt_dir,
+        args.dtype,
+        quant_config=args_to_quant_config(args),
+        mapping=mapping,
+        use_parallel_embedding=args.use_parallel_embedding,
+        embedding_sharding_dim=args.embedding_sharding_dim)
+    llama.save_checkpoint(args.output_dir, save_config=(rank == 0))
+
+
+def args_to_build_options(args):
+    return {
+        'use_parallel_embedding': args.use_parallel_embedding,
+        'embedding_sharding_dim': args.embedding_sharding_dim,
+        'share_embedding_table': args.use_embedding_sharing,
+        'disable_weight_only_quant_plugin':
+        args.disable_weight_only_quant_plugin,
+        'remove_duplicated_kv_heads': args.remove_duplicated_kv_heads,
+        'quant_ckpt_path': args.quant_ckpt_path,
+        'load_model_on_cpu': args.load_model_on_cpu,
+    }
+
+
+def from_cli_args(args):
+    n_kv_head = args.n_kv_head if args.n_kv_head is not None else args.n_head
+    config = {
+        'architecture': "LlamaForCausalLM",
+        'dtype': args.dtype,
+        'logits_dtype': 'float32',
+        'num_hidden_layers': args.n_layer,
+        'num_attention_heads': args.n_head,
+        'hidden_size': args.n_embd,
+        'intermediate_size': args.inter_size,
+        'ffn_dim_multiplier': args.ffn_dim_multiplier,
+        'multiple_of': args.multiple_of,
+        'num_key_value_heads': n_kv_head,
+        'vocab_size': args.vocab_size,
+        'position_embedding_type': 'rope_gpt_neox',
+        'max_position_embeddings': args.n_positions,
+        'hidden_act': args.hidden_act,
+        'rotary_base': args.rotary_base,
+        'norm_epsilon': args.rms_norm_eps,
+        'moe': {
+            'num_experts': args.moe_num_experts,
+            'top_k': args.moe_top_k,
+            'normalization_mode': args.moe_renorm_mode,
+        },
+        'mapping': {
+            'world_size': args.tp_size * args.pp_size,
+            'tp_size': args.tp_size,
+            'pp_size': args.pp_size,
+            'moe_tp_size': args.moe_tp_size,
+            'moe_ep_size': args.moe_ep_size,
+        },
+        'quantization': args_to_quant_config(args).to_dict()
+    }
+    config.update(args_to_build_options(args))
+    return config
+
+
+def convert_and_save_hf(args):
+    model_dir = args.model_dir
+    load_model_on_cpu = args.load_model_on_cpu
+    load_by_shard = args.load_by_shard
+    world_size = args.tp_size * args.pp_size
+    # Need to convert the cli args to key-value pairs and override them in the generated config dict.
+    # Ideally these fields will be moved out of the config and passed into the build API; keep them here for compatibility purposes for now,
+    # before the refactor is done.
+    override_fields = {}
+    override_fields.update(args_to_build_options(args))
+
+    quant_config = args_to_quant_config(args)
+
+    try:
+        hf_config = AutoConfig.from_pretrained(model_dir,
+                                               trust_remote_code=True)
+        quant_config = update_quant_config_from_hf(quant_config, hf_config)
+    except:
+        # llava_llama needs its own defined config.
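+        # If the HF config cannot be parsed (e.g. llava_llama), fall back to the
+        # quantization settings derived from the CLI flags above.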
+ logger.warning("AutoConfig cannot load the huggingface config.") + + if args.smoothquant is not None or args.int8_kv_cache: + assert not args.load_by_shard, "When using quantization, TRT-LLM needs to load the whole HF model, thus load by shard not supported" + mapping = Mapping(world_size=world_size, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + # TODO: support moe quantization for tp + ep + LLaMAForCausalLM.quantize( + args.model_dir, + args.output_dir, + dtype=args.dtype, + mapping=mapping, + quant_config=quant_config, + device='cpu' if args.load_model_on_cpu else 'cuda', + calib_dataset=args.calib_dataset, + **override_fields) + else: + # When not loading by shard, preload one complete model and then slice per rank weights from this + # this saves the disk reloading time + hf_model = None + if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER") is not None \ + and os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER").strip() == "2": + if "vila" in model_dir or "llava" in model_dir: + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + elif not (args.load_by_shard or + (has_safetensors(model_dir) + and not quant_config.quant_mode.has_any_quant())): + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + llama = LLaMAForCausalLM.from_hugging_face( + model_dir if hf_model is None else hf_model, + args.dtype, + mapping=mapping, + quant_config=quant_config, + load_by_shard=load_by_shard, + **override_fields, + ) + llama.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del llama + + execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
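+            # Note: each failing worker's traceback is printed as its future completes,
+            # and the run only aborts after all futures have finished.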
+ + +def main(): + print(tensorrt_llm.__version__) + args = parse_arguments() + logger.set_level(args.log_level) + + world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + if (args.model_dir is None + and args.meta_ckpt_dir is None): # generate fake config.json + config = from_cli_args(args) + with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + elif args.meta_ckpt_dir is not None: + assert args.model_dir is None, "Shall not specify both meta checkpoint dir and hugging face dir" + execute(args.workers, [convert_and_save_meta] * world_size, args) + else: # all other paths from hf model + assert args.model_dir is not None + assert ( + args.quant_ckpt_path is not None + and args.weight_only_precision == 'int4_gptq' + ) or args.quant_ckpt_path is None, "only gptq weights only needs this option" + convert_and_save_hf(args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/run.py b/models/nlp/large_language_model/llama2-70b/trtllm/run.py index 3899ec9d55a33bca6eeeac4840353345467b474d..5590749592d3237b3087f2b745fd9abb9569bf51 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/run.py +++ b/models/nlp/large_language_model/llama2-70b/trtllm/run.py @@ -16,63 +16,45 @@ import argparse import ast import csv +import os from pathlib import Path import sys import time +import sys +import time import numpy as np import torch +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + add_common_args, load_tokenizer, read_decoder_start_token_id, + read_model_name, supports_inflight_batching, + throttle_generator) + import tensorrt_llm import tensorrt_llm.profiler from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - load_tokenizer, read_model_name, throttle_generator) - if PYTHON_BINDINGS: from tensorrt_llm.runtime import ModelRunnerCpp def parse_arguments(args=None): + # see `add_common_args` for extended list of arguments parser = argparse.ArgumentParser() + parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--max_attention_window_size', - type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - parser.add_argument('--log_level', type=str, default='error') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") parser.add_argument( '--input_text', type=str, 
nargs='+', default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--no_prompt_template', - dest='use_prompt_template', - default=True, - action='store_false', - help= - "Whether or not to use default prompt template to wrap the input text.") parser.add_argument( '--input_file', type=str, help= 'CSV or Numpy file containing tokenized input. Alternative to text input.', default=None) - parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--output_csv', type=str, help='CSV file where the tokenized output is stored.', @@ -87,89 +69,26 @@ def parse_arguments(args=None): help= 'Numpy file where the generation logits are stored. Use only when num_beams==1', default=None) - parser.add_argument('--tokenizer_dir', - help="HF tokenizer config path", - default='gpt2') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file', - help="Used for sentencepiece tokenizers") - parser.add_argument('--num_beams', - type=int, - help="Use beam search if num_beams >1", - default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument('--streaming', default=False, action='store_true') - parser.add_argument('--streaming_interval', - type=int, - help="How often to return tokens when streaming.", - default=5) - parser.add_argument( - '--prompt_table_path', - type=str, - help="Path to .npy file, exported by nemo_prompt_convert.py") - parser.add_argument( - '--prompt_tasks', - help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") - parser.add_argument('--lora_dir', + parser.add_argument('--output_log_probs_npy', type=str, - default=None, - nargs="+", - help="The directory of LoRA weights") - parser.add_argument( - '--lora_task_uids', - type=str, - default=None, - nargs="+", - help="The list of LoRA task uids; use -1 to disable the LoRA module") - parser.add_argument('--lora_ckpt_source', + help='Numpy file where the log_probs are stored', + default=None) + parser.add_argument('--output_cum_log_probs_npy', type=str, - default="hf", - choices=["hf", "nemo"], - help="The source of lora checkpoint.") - parser.add_argument( - '--num_prepend_vtokens', - nargs="+", - type=int, - help="Number of (default) virtual tokens to prepend to each sentence." 
- " For example, '--num_prepend_vtokens=10' will prepend the tokens" - " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + help='Numpy file where the cum_log_probs are stored', + default=None) parser.add_argument( '--run_profiling', default=False, action='store_true', help="Run several 10 iterations to profile the inference latencies.") - parser.add_argument( - '--medusa_choices', - type=str, - default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." - ) parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_qps', type=float, default=0) + parser = add_common_args(parser) return parser.parse_args(args=args) @@ -182,7 +101,8 @@ def parse_input(tokenizer, max_input_length=923, pad_id=None, num_prepend_vtokens=[], - model_name=None): + model_name=None, + model_version=None): if pad_id is None: pad_id = tokenizer.pad_token_id @@ -211,13 +131,12 @@ def parse_input(tokenizer, elif input_file.endswith('.txt'): with open(input_file, 'r', encoding='utf-8', errors='replace') as txt_file: - input_text = txt_file.read() - input_ids = tokenizer.encode( + input_text = txt_file.readlines() + batch_input_ids = tokenizer( input_text, add_special_tokens=add_special_tokens, truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) + max_length=max_input_length)["input_ids"] else: print('Input file format not supported.') raise SystemExit @@ -230,9 +149,11 @@ def parse_input(tokenizer, batch_input_ids[i] = list( range(base_vocab_size, base_vocab_size + length)) + batch_input_ids[i] - if model_name == 'glm_10b': + + if input_file is None and 'GLM' in model_name and model_version == 'glm': for ids in batch_input_ids: ids.append(tokenizer.sop_token_id) + batch_input_ids = [ torch.tensor(x, dtype=torch.int32) for x in batch_input_ids ] @@ -247,7 +168,11 @@ def print_output(tokenizer, output_npy=None, context_logits=None, generation_logits=None, - output_logits_npy=None): + cum_log_probs=None, + log_probs=None, + output_logits_npy=None, + output_cum_log_probs_npy=None, + output_log_probs_npy=None): batch_size, num_beams, _ = output_ids.size() if output_csv is None and output_npy is None: for batch_idx in range(batch_size): @@ -265,7 +190,6 @@ def print_output(tokenizer, f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: output_file = Path(output_csv) output_file.parent.mkdir(exist_ok=True, parents=True) @@ -303,6 +227,20 @@ def print_output(tokenizer, dtype='float32') np.save(output_generation_logits_file, generation_outputs) + # Save cum log probs + if cum_log_probs is not None and output_cum_log_probs_npy is not None: + cum_log_probs_file = Path(output_cum_log_probs_npy) + cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), + dtype='float32') + np.save(cum_log_probs_file, cum_log_probs_outputs) + + # Save cum log probs + if log_probs is not None and output_log_probs_npy is not None: + log_probs_file = Path(output_log_probs_npy) + log_probs_outputs = np.array(log_probs.cpu().contiguous(), + dtype='float32') + np.save(log_probs_file, log_probs_outputs) + def check_status(args, load_engine_time, qps): print("==================== check status ====================") @@ -320,28 +258,35 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = 
read_model_name(args.engine_dir) - if args.tokenizer_dir is None: + # different handling if encoder-decoder models + is_enc_dec = { + name + for name in os.listdir(args.engine_dir) + if os.path.isdir(os.path.join(args.engine_dir, name)) + } == {'encoder', 'decoder'} + if is_enc_dec: + logger.warning( + "This path is an encoder-decoder model. Using different handling.") + assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." + + model_name, model_version = read_model_name( + args.engine_dir) if not is_enc_dec else ("", "") + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: + logger.warning( + "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." + ) args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) - # # An example to stop generation when the model generate " London" on first sentence, " eventually became" on second sentence - # stop_words_list = [[" London"], ["eventually became"]] - # stop_words_list = tensorrt_llm.runtime.to_word_list_format(stop_words_list, tokenizer) - # stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous() - stop_words_list = None - - # # An example to prevent generating " chef" on first sentence, " eventually" and " chef before" on second sentence - # bad_words_list = [[" chef"], [" eventually, chef before"]] - # bad_words_list = tensorrt_llm.runtime.to_word_list_format(bad_words_list, tokenizer) - # bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous() - bad_words_list = None + if args.end_id: + end_id = args.end_id prompt_template = None if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: @@ -354,8 +299,47 @@ def main(args): max_input_length=args.max_input_length, pad_id=pad_id, num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name) - input_lengths = [x.size(0) for x in batch_input_ids] + model_name=model_name, + model_version=model_version) + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * len(batch_input_ids) + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + + if is_enc_dec: + encoder_input_ids = batch_input_ids + decoder_start_token_id = read_decoder_start_token_id( + os.path.join(args.engine_dir, "decoder")) + decoder_input_ids = [ + torch.tensor([decoder_start_token_id], dtype=torch.int32) + for _ in batch_input_ids + ] + + input_lengths = [x.size(0) for x in decoder_input_ids + ] if is_enc_dec else [x.size(0) for x in batch_input_ids] + encoder_input_lengths = [x.size(0) + for x in encoder_input_ids] if is_enc_dec else None + + if not args.use_py_session and not supports_inflight_batching( + os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. 
+ engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True if not PYTHON_BINDINGS and not args.use_py_session: logger.warning( @@ -367,34 +351,60 @@ def main(args): "Debug mode is not supported in C++ session for now, fallback to Python session." ) args.use_py_session = True + if args.return_all_generated_tokens and args.use_py_session: + raise ValueError( + "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." + ) + if (not args.return_all_generated_tokens) and args.streaming and ( + args.num_beams > 1): + logger.warning( + "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " + "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + args.return_all_generated_tokens = True runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source) + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + ) + if not args.use_py_session: + runner_kwargs.update(is_enc_dec=is_enc_dec) if args.medusa_choices is not None: args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" assert args.num_beams == 1, "Medusa should use num_beams == 1" runner_kwargs.update(medusa_choices=args.medusa_choices) if not args.use_py_session: runner_kwargs.update( max_batch_size=len(batch_input_ids), - max_input_len=max(input_lengths), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), max_output_len=args.max_output_len, max_beam_width=args.num_beams, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, - ) + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) runner = runner_cls.from_dir(**runner_kwargs) torch.cuda.synchronize() start_time = time.time() with torch.no_grad(): outputs = runner.generate( - batch_input_ids, + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, max_new_tokens=args.max_output_len, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, @@ -405,27 +415,32 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, return_dict=True, - medusa_choices=args.medusa_choices) + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens) torch.cuda.synchronize() - - status = False + end_time = time.time() + if runtime_rank == 0: num_inputs = sum([torch.numel(x) for x in batch_input_ids]) num_outputs = torch.numel(outputs["output_ids"]) num_gens = num_outputs - num_inputs - load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") qps = num_gens/(end_time-start_time) logger.info(f'Load engine takes: {load_engine_time} sec') @@ -433,29 +448,46 @@ def main(args): status = check_status(args, load_engine_time, qps) else: status = True - + if args.streaming: for curr_outputs in throttle_generator(outputs, args.streaming_interval): if runtime_rank == 0: output_ids = curr_outputs['output_ids'] sequence_lengths = curr_outputs['sequence_lengths'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy) + cum_log_probs = None + log_probs = None + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] + print_output( + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) else: if runtime_rank == 0: output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] context_logits = None generation_logits = None + cum_log_probs = None + log_probs = None if runner.gather_context_logits: context_logits = outputs['context_logits'] if runner.gather_generation_logits: generation_logits = outputs['generation_logits'] + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] print_output(tokenizer, output_ids, input_lengths, @@ -464,7 +496,11 @@ def 
main(args): output_npy=args.output_npy, context_logits=context_logits, generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy) + output_logits_npy=args.output_logits_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) if args.run_profiling: ite = 10 @@ -482,17 +518,24 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.start("tmp") @@ -509,23 +552,31 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.stop("tmp") print( f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" ) + if status: print("successful.") else: @@ -536,4 +587,4 @@ def main(args): if __name__ == '__main__': args = parse_arguments() print(args) - main(args) + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt index f7cbbb8b7e9bbd8aab6303fd8b5de1dacbd353b8..38e019fe009252d42e512c6b71cc261bb7788de7 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/requirements.txt @@ -1,30 +1,43 @@ -accelerate +accelerate>=0.25.0 build colored # cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. -diffusers +# diffusers>=0.27.0 lark mpi4py -numpy +numpy<2 onnx>=1.12.0 +openai polygraphy psutil pybind11 -pynvml>=11.5.0 -sentencepiece>=0.1.99 -# tensorrt==9.2.0.post12.dev5 -# torch -# nvidia-ammo~=0.5.0; platform_machine=="x86_64" -transformers +# pynvml>=11.5.0 +pulp +pandas +h5py==3.10.0 +StrEnum +# tensorrt~=10.3.0 +# https://github.com/pytorch/pytorch/blob/v2.4.0/version.txt uses 2.4.0a0. 
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 uses 2.4.0a0. +# torch>=2.4.0a0,<=2.4.0 +# nvidia-modelopt~=0.15.0 +transformers>=4.38.2,<=4.42.4 +#transformers +pillow==10.3.0 wheel optimum -evaluate janus -parameterized -scikit-learn +mpmath>=1.3.0 +click +click_option_group +aenum +datasets==2.14.6 +evaluate~=0.4.1 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 + -# special -scipy==1.11.4 -pandas==1.5.3 -nltk -rouge_score +setuptools +parameterized +# scikit-learn +# scipy==1.11.4 \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh index 042f1fd06abedbe3b856a726ff15a07982426cd6..2f260c8860f85b378538c24d98f58194ad712d4e 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8.sh @@ -1,15 +1,34 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + #!/bin/bash +echo "start run $0" + EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +LOAD_TIME_TARGET=${LOAD_TIME_TARGET:-39} +TPS_TARGET=${TPS_TARGET:-14.8} PROJECT_DIR="./" DATASET_DIR=${DATASET_DIR:-"${PROJECT_DIR}/data/datasets_cnn_dailymail"} MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-70b-chat"} -ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}/checkpoints/"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -24,7 +43,7 @@ check_status() export TASK_DATA_PATH=${DATASET_DIR} -# target is 95% of best (load engine time: 14.65, rouge1: 29.19, tps: 18.59) +# target is 80% of best (load engine time: 27, rouge1: 29.19, tps: 18.50) mpirun -n 8 --allow-run-as-root \ python3 ${PROJECT_DIR}/summarize.py \ --test_trt_llm \ @@ -35,8 +54,8 @@ python3 ${PROJECT_DIR}/summarize.py \ --tokenizer_dir ${MODEL_DIR} \ --tokenizer_type "llama" \ --engine_dir ${ENGINE_DIR} \ ---target_load_engine_time 15.4 \ --tensorrt_llm_rouge1_threshold 27.73 \ ---target_tps 17.66 \ +--target_load_engine_time ${LOAD_TIME_TARGET} \ +--target_tps ${TPS_TARGET} \ --use_py_session "$@"; check_status -exit ${EXIT_STATUS} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh index 27af165062b22f46443d127e9da2ce8057f72b7b..4ff5c49fe1d50a129c6024b789e6f34804436902 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh +++ b/models/nlp/large_language_model/llama2-70b/trtllm/scripts/test_trtllm_llama2_70b_gpu8_build.sh @@ -1,14 +1,33 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor 
Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + #!/bin/bash +echo "start run $0" + EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +BUILD_TIME_TARGET=${BUILD_TIME_TARGET:-435} PROJECT_DIR="./" MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-70b-chat"} -OUTPUT_DIR=${OUTPUT_DIR:-"${PROJECT_DIR}/checkpoints/"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} +CHECKPOINT_DIR="${ENGINE_DIR}/checkpoints" export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -20,16 +39,23 @@ check_status() fi } -python3 ${PROJECT_DIR}/build.py \ ---log_level ${LOG_LEVEL} \ ---dtype ${DTYPE} \ +export TRTLLM_DISABLE_UNIFIED_CONVERTER=2 +python3 convert_checkpoint.py \ --model_dir ${MODEL_DIR} \ ---remove_input_padding \ ---use_gpt_attention_plugin float16 --use_gemm_plugin float16 \ ---enable_context_fmha \ ---world_size 8 \ +--output_dir ${CHECKPOINT_DIR} \ --tp_size 8 \ ---output_dir ${OUTPUT_DIR} "$@"; check_status -exit ${EXIT_STATUS} - - +--workers 8 \ +--dtype ${DTYPE} + + +# best(build engine time: 304) is 70% of target 435 +trtllm-build \ +--log_level ${LOG_LEVEL} \ +--max_batch_size ${BS} \ +--checkpoint_dir ${CHECKPOINT_DIR} \ +--remove_input_padding enable \ +--context_fmha enable \ +--workers 8 \ +--total_build_time_target ${BUILD_TIME_TARGET} \ +--output_dir ${ENGINE_DIR} "$@"; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py b/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py index acf06abd7708c098c30a40bc905a52d84d83deb6..8896ab0bfb500807ea8e6d8a2b9568a3afc257f6 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py +++ b/models/nlp/large_language_model/llama2-70b/trtllm/summarize.py @@ -26,11 +26,12 @@ import torch from datasets import load_dataset, load_from_disk from transformers import (AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, GenerationConfig) -from utils import DEFAULT_HF_MODEL_DIRS, load_tokenizer, read_model_name +from utils import (DEFAULT_HF_MODEL_DIRS, add_common_args, load_tokenizer, + read_model_name, supports_inflight_batching) import tensorrt_llm import tensorrt_llm.profiler as profiler -from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm._utils import mpi_broadcast, str_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner from tensorrt_llm.tools.ppl import ppl @@ -58,19 +59,26 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = read_model_name(args.engine_dir) + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + model_name, model_version = read_model_name(args.engine_dir) if args.hf_model_dir is None: - args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + logger.warning( + "hf_model_dir is not specified. Try to infer from model_name, but this may be incorrect." 
+ ) + if model_name in DEFAULT_HF_MODEL_DIRS: + args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + else: + args.hf_model_dir = None if args.tokenizer_dir is None: args.tokenizer_dir = args.hf_model_dir - test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 - test_trt_llm = args.test_trt_llm profiler.start('load tokenizer') tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) profiler.stop('load tokenizer') @@ -96,24 +104,34 @@ def main(args): dataset_input_key = 'input' dataset_output_key = 'output' dataset_split = 'validation' # only this split contains reference strings - - + elif args.eval_task == "eval_context_ppl": + dataset_name = "SlimPajama-6B" + dataset_revision = None + dataset_input_key = 'text' + dataset_output_key = 'text' + dataset_split = 'test' + args.output_len = 1 # Only want to compute the ppl of context + args.eval_ppl = True + logger.warning( + f"Run task '{args.eval_task}', setting 'output_len' to 1, and enable 'eval_ppl'." + ) + if args.dataset_dir is not None and isinstance(args.dataset_dir, str): + args.dataset_dir = args.dataset_dir.rstrip('/') + if args.dataset_dir.endswith(dataset_name): + dataset_name = args.dataset_dir + else: + dataset_name = f"{args.dataset_dir}/{dataset_name}" + logger.info(f"prepare datasets....") if os.getenv("TASK_DATA_PATH"): dataset = load_from_disk(os.getenv("TASK_DATA_PATH"))[dataset_split] else: - # dataset = load_dataset(dataset_name, - # dataset_revision, - # cache_dir=args.dataset_path, - # split=dataset_split, - # trust_remote_code=True) - dataset = load_dataset(dataset_name, dataset_revision, - cache_dir=args.dataset_path, + cache_dir=args.dataset_cache_dir, split=dataset_split) - logger.info(f"datasets is ready.") + max_batch_size = args.batch_size # runtime parameters @@ -124,77 +142,35 @@ def main(args): max_attention_window_size = args.max_attention_window_size sink_token_length = args.sink_token_length + if args.end_id: + end_id = args.end_id + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * args.batch_size + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + # random_seed = 5 temperature = args.temperature num_beams = args.num_beams length_penalty = args.length_penalty + early_stopping = args.early_stopping repetition_penalty = args.repetition_penalty presence_penalty = args.presence_penalty frequency_penalty = args.frequency_penalty - if test_trt_llm: - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." 
- ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - rank=runtime_rank, - debug_mode=args.debug_mode) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=max_batch_size, - max_input_len=test_token_num, - max_output_len=output_len, - max_beam_width=num_beams, - max_attention_window_size=max_attention_window_size, - sink_token_length=sink_token_length) - runner = runner_cls.from_dir(**runner_kwargs) - assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ - "PPL evaluation requires engine built with gather_all_token_logits enabled" - - if test_hf: - profiler.start('load HF model') - dtype_alias_mapping = { - 'fp32': 'float32', - 'fp16': 'float16', - 'bf16': 'bfloat16' - } - args.data_type = dtype_alias_mapping.get(args.data_type, args.data_type) - if model_name.startswith('chatglm'): - auto_model_cls = AutoModel - elif model_name.startswith('glm'): - auto_model_cls = AutoModelForSeq2SeqLM - else: - auto_model_cls = AutoModelForCausalLM - model = auto_model_cls.from_pretrained( - args.hf_model_dir, - trust_remote_code=True, - torch_dtype=str_dtype_to_torch(args.data_type), - device_map='auto' if args.hf_device_map_auto else None) - try: - model.to_bettertransformer() - except ValueError as e: - logger.warning( - f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' - ) - if not args.hf_device_map_auto: - model.cuda() - if model_name == 'qwen': - model.generation_config = GenerationConfig.from_pretrained( - args.hf_model_dir, trust_remote_code=True) - profiler.stop('load HF model') - logger.info( - f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' - ) - output_dir = Path(args.output_dir) if args.output_dir else None if output_dir is not None: output_dir.mkdir(exist_ok=True, parents=True) @@ -207,9 +183,21 @@ def main(args): f.write(f'Model path: {args.hf_model_dir}\n') f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + # TODO: Add random_seed flag in gptj + rouge_dir = args.rouge_dir if args.rouge_dir and os.path.exists( + args.rouge_dir) else "rouge" + metric_tensorrt_llm = [evaluate.load(rouge_dir) for _ in range(num_beams)] + metric_hf = [evaluate.load(rouge_dir) for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + ppls_trt_llm = [[] for _ in range(num_beams)] + ppls_hf = [[] for _ in range(num_beams)] + def _prepare_inputs(batch_input_texts, eval_task='summarize', - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(batch_input_texts) append_str = ' TL;DR: ' if eval_task == 'summarize' else '' batch_input_ids = [] @@ -218,12 +206,13 @@ def main(args): curr_text = curr_text.strip().replace(" n't", "n't") # TODO: The below lines are used to be compatible with the original code; may need fix - if model_name.startswith(('chatglm2', 'chatglm3')): + if 'GLM' in model_name and model_version in ('chatglm2', + 'chatglm3'): input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0) input_ids = 
input_ids[:test_token_num] - elif model_name == 'qwen': - from qwen.utils.utils import make_context + elif 'qwen' in model_name.lower() and model_version == 'qwen': + from tensorrt_llm.models.qwen.utils import make_context # use make_content to generate prompt system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user." _, input_id_list = make_context( @@ -235,6 +224,18 @@ def main(args): ) input_ids = torch.tensor(input_id_list) else: + if 'qwen' in model_name.lower() and 'qwen2' in model_version: + messages = [{ + "role": + "system", + "content": + "You are a helpful assistant, please summarize the article entered by the user with one or two sentences." + }, { + "role": "user", + "content": curr_text + }] + curr_text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) input_ids = tokenizer.encode( curr_text, return_tensors='pt', @@ -242,17 +243,23 @@ def main(args): truncation=True, max_length=test_token_num).squeeze(0) - batch_input_ids.append(input_ids) + if input_ids.numel() > min_input_length: + batch_input_ids.append(input_ids) return batch_input_ids def eval_trt_llm(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], {} input_lengths = [x.size(0) for x in batch_input_ids] with torch.no_grad(): @@ -266,11 +273,15 @@ def main(args): temperature=temperature, top_k=top_k, top_p=top_p, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, num_beams=num_beams, length_penalty=length_penalty, + early_stopping=early_stopping, repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + lora_uids=args.lora_task_uids, output_sequence_lengths=True, return_dict=True, medusa_choices=args.medusa_choices) @@ -327,7 +338,8 @@ def main(args): def eval_hf(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) if batch_size > 1: logger.warning( @@ -335,7 +347,11 @@ def main(args): ) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], [[] for _ in range(batch_size)] input_lengths = [x.size(0) for x in batch_input_ids] # Left padding for HF max_length = max(input_lengths) @@ -349,6 +365,12 @@ def main(args): batch_input_ids = torch.stack(batch_input_ids) batch_input_ids = batch_input_ids.cuda() + # specialization for HF + if early_stopping in [0, 1]: + local_early_stopping = bool(early_stopping) + else: + local_early_stopping = "never" + with torch.no_grad(): outputs = model.generate(batch_input_ids, max_new_tokens=output_len, @@ -358,8 +380,8 @@ def main(args): pad_token_id=pad_id, num_beams=num_beams, num_return_sequences=num_beams, - early_stopping=True, length_penalty=length_penalty, + early_stopping=local_early_stopping, output_scores=True, return_dict_in_generate=True) if 
eval_ppl and batch_size == 1: @@ -384,7 +406,14 @@ def main(args): output_ids != pad_id).sum(dim=-1) context_logits = context_outputs['logits'] # Remove the first generation logits which are same to last context logits - generation_logits = torch.stack(outputs['scores'][1:], dim=1) + generation_logits = outputs['scores'][1:] + # When output_len is 1, generation_logits would be () and lead to error if we do torch.stack + if len(generation_logits) == 0: + generation_logits = torch.empty( + [context_logits.shape[0], 0, context_logits.shape[-1]], + device=context_logits.device) + else: + generation_logits = torch.stack(generation_logits, dim=1) _, max_gen_len, voc_size = generation_logits.size() generation_logits = generation_logits.view(batch_size, num_beams, max_gen_len, voc_size) @@ -410,12 +439,58 @@ def main(args): return output_lines_list, tokens_list, ppls if test_trt_llm: + if not supports_inflight_batching(args.engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." + ) + args.use_py_session = True + if args.return_all_generated_tokens: + raise ValueError( + "Returning all the generated tokens at each step is not supported in summarize.py" + ) + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + gpu_weights_percent=args.gpu_weights_percent) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=max_batch_size, + max_input_len=test_token_num, + max_output_len=output_len, + max_beam_width=num_beams, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) + runner = runner_cls.from_dir(**runner_kwargs) + assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + datapoint = dataset[0:1] output, *_ = eval_trt_llm(datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - if runtime_rank == 0: + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": logger.info( "---------------------------------------------------------") logger.info("TensorRT-LLM Generated : ") @@ -424,71 +499,50 @@ def main(args): logger.info(f"\n Output : {output}") logger.info( "---------------------------------------------------------") - if test_hf: - datapoint = dataset[0:1] - output, *_ = eval_hf(datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - logger.info("---------------------------------------------------------") - logger.info("HF Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info("---------------------------------------------------------") - # TODO: Add random_seed flag in gptj - metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] - metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] - for i in range(num_beams): - metric_tensorrt_llm[i].seed = 0 - metric_hf[i].seed = 0 - ppls_trt_llm = [[] for _ in range(num_beams)] - ppls_hf = [[] for _ in range(num_beams)] + ite_count = 0 + data_point_idx = 0 + total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 + + if args.stability_test: + logger.info(f"stability test, need {args.stability_test_hours} hours") + else: + logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") + stability_start_time = time.time() - ite_count = 0 - data_point_idx = 0 - total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 - - if args.stability_test: - logger.info(f"stability test, need {args.stability_test_hours} hours") - else: - logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") - stability_start_time = time.time() - while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): - if runtime_rank == 0: - logger.debug( - f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" - ) - datapoint = dataset[data_point_idx:(data_point_idx + max_batch_size)] + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] - if test_trt_llm: profiler.start('tensorrt_llm') output_tensorrt_llm, output_ids_trt_llm, curr_ppls_trt_llm, lengths_info = eval_trt_llm( datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) profiler.stop('tensorrt_llm') + + empty_batch = (runtime_rank == 0 and len(output_tensorrt_llm) == 0) + 
empty_batch = mpi_broadcast(empty_batch, 0) + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + if runtime_rank == 0: input_lengths = lengths_info['input_lengths'] seq_lengths = lengths_info['seq_lengths'] output_token_count_trt_llm = sum( - seq_lengths[idx][0] - input_lengths[idx] - for idx in range(len(input_lengths))) + seq_lengths[bs][bm] - input_lengths[bs] + for bm in range(len(output_tensorrt_llm[0])) + for bs in range(len(output_tensorrt_llm))) total_output_token_count_trt_llm += output_token_count_trt_llm - if test_hf: - profiler.start('hf') - output_hf, _, curr_ppls_hf = eval_hf( - datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - profiler.stop('hf') - - if runtime_rank == 0: - if test_trt_llm: for batch_idx in range(len(output_tensorrt_llm)): for beam_idx in range(num_beams): metric_tensorrt_llm[beam_idx].add_batch( @@ -502,13 +556,121 @@ def main(args): ppls_trt_llm[beam_idx].append( curr_ppls_trt_llm[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_tensorrt_llm[0])): for beam_idx in range(num_beams): with (output_dir / 'trtllm.out').open('a') as f: - f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n') - # yapf: enable - if test_hf: + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n' + ) + + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") + logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + logger.debug(f"Reference : {datapoint[dataset_output_key]}") + + data_point_idx += max_batch_size + ite_count += 1 + + if args.stability_test: + test_time_hours = round((time.time() - stability_start_time) / 3600, 1) + if test_time_hours > args.stability_test_hours: + if runtime_rank == 0: + logger.info(f"Stability Test Finished. 
Total run {test_time_hours} hours.") + break + else: + data_point_idx = data_point_idx % len(dataset) + ite_count = ite_count % args.max_ite + if runtime_rank == 0 and ite_count % 100 == 0: + logger.info( + f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") + elif runtime_rank == 0 and ite_count % 10 == 0: + logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + + del runner + + if test_hf and runtime_rank == 0: + profiler.start('load HF model') + dtype_alias_mapping = { + 'fp32': 'float32', + 'fp16': 'float16', + 'bf16': 'bfloat16' + } + args.hf_data_type = dtype_alias_mapping.get(args.hf_data_type, + args.hf_data_type) + if 'GLM' in model_name and model_version == 'glm': + auto_model_cls = AutoModelForSeq2SeqLM + elif 'GLM' in model_name and model_version == 'chatglm': + auto_model_cls = AutoModel + else: + auto_model_cls = AutoModelForCausalLM + model = auto_model_cls.from_pretrained( + args.hf_model_dir, + trust_remote_code=True, + torch_dtype=str_dtype_to_torch(args.hf_data_type), + device_map='auto' if args.hf_device_map_auto else None) + try: + model.to_bettertransformer() + except Exception as e: + logger.warning( + f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' + ) + if not args.hf_device_map_auto: + model.cuda() + if model_name == 'qwen': + model.generation_config = GenerationConfig.from_pretrained( + args.hf_model_dir, trust_remote_code=True) + profiler.stop('load HF model') + logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + + datapoint = dataset[0:1] + output, *_ = eval_hf(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": + logger.info( + "---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + logger.info( + "---------------------------------------------------------") + + ite_count = 0 + data_point_idx = 0 + total_output_token_count_hf = 0 # only valid for runtime_rank == 0 + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] + + profiler.start('hf') + output_hf, token_list, curr_ppls_hf = eval_hf( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + profiler.stop('hf') + + # HF model runs on rank 0 only + empty_batch = len(output_hf) == 0 + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + + if runtime_rank == 0: + seq_lengths = [len(tokens) for tokens in token_list] + total_output_token_count_hf += sum(seq_lengths) + for beam_idx in range(num_beams): for batch_idx in range(len(output_hf[beam_idx])): metric_hf[beam_idx].add_batch( @@ -520,37 +682,21 @@ def main(args): ppls_hf[beam_idx].append( curr_ppls_hf[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_hf[0])): for beam_idx in range(num_beams): with (output_dir / 'hf.out').open('a') as f: - 
f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n') - # yapf: enable + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n' + ) - logger.debug('-' * 100) - logger.debug(f"Input : {datapoint[dataset_input_key]}") - if test_trt_llm: - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') - if test_hf: + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") logger.debug(f'HF Output: {output_hf}') - logger.debug(f"Reference : {datapoint[dataset_output_key]}") + logger.debug(f"Reference : {datapoint[dataset_output_key]}") - data_point_idx += max_batch_size - ite_count += 1 - - if args.stability_test: - test_time_hours = round((time.time() - stability_start_time)/3600, 1) - if test_time_hours > args.stability_test_hours: - if runtime_rank == 0: - logger.info(f"Stability Test Finished. Total run {test_time_hours} hours.") - break - else: - data_point_idx = data_point_idx % len(dataset) - ite_count = ite_count % args.max_ite - if runtime_rank == 0 and ite_count % 100 == 0: - logger.info(f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") - elif runtime_rank == 0 and ite_count % 10 == 0: - logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + data_point_idx += max_batch_size + ite_count += 1 + del model if runtime_rank == 0: if test_trt_llm: @@ -558,6 +704,7 @@ def main(args): logger.info( f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' ) + logger.info( f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' ) @@ -567,30 +714,30 @@ def main(args): rouge1 = 0 tps = total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm") - + for beam_idx in range(num_beams): logger.info(f"TensorRT-LLM beam {beam_idx} result") - computed_metrics_tensorrt_llm = metric_tensorrt_llm[ - beam_idx].compute() - for key in computed_metrics_tensorrt_llm.keys(): - logger.info( - f' {key} : {computed_metrics_tensorrt_llm[key]*100}') - - if args.check_accuracy and beam_idx == 0: - assert computed_metrics_tensorrt_llm[ - 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold - - if beam_idx == 0: - rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 - + if args.eval_task != "eval_context_ppl": + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key]*100}' + ) + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm[ + 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold + + if beam_idx == 0: + rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 if args.eval_ppl: logger.info( f" Per-token perplexity: {np.mean(ppls_trt_llm[beam_idx])}" ) if args.check_accuracy and beam_idx == 0: - assert np.mean(ppls_trt_llm[beam_idx] - ) < args.tensorrt_llm_ppl_threshold - + avg_ppl = np.mean(ppls_trt_llm[beam_idx]) + assert avg_ppl < args.tensorrt_llm_ppl_threshold, f"[FAILED] average PPL ({avg_ppl}) is larger than threshold ({args.tensorrt_llm_ppl_threshold})" + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") logger.info(f'Load engine takes: {load_engine_time} sec') @@ -599,19 +746,27 @@ def main(args): print("successful.") else: print("failed.") - - sys.exit(int(not status)) - + + sys.exit(int(not status)) + if test_hf: np.random.seed(0) # rouge score use sampling to compute the score logger.info( f'Hugging Face (total 
latency: {profiler.elapsed_time_in_sec("hf")} sec)' ) + logger.info( + f'Hugging Face (total output tokens: {total_output_token_count_hf})' + ) + logger.info( + f'Hugging Face (tokens per second: {total_output_token_count_hf / profiler.elapsed_time_in_sec("hf")})' + ) + for beam_idx in range(num_beams): logger.info(f"HF beam {beam_idx} result") computed_metrics_hf = metric_hf[beam_idx].compute() - for key in computed_metrics_hf.keys(): - logger.info(f' {key} : {computed_metrics_hf[key]*100}') + if args.eval_task != "eval_context_ppl": + for key in computed_metrics_hf.keys(): + logger.info(f' {key} : {computed_metrics_hf[key]*100}') if args.eval_ppl and args.batch_size == 1: logger.info( f" Per-token perplexity: {np.mean(ppls_hf[beam_idx])}") @@ -619,34 +774,15 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) - parser.add_argument( - '--tokenizer_dir', - default=None, - help='tokenizer path; defaults to hf_model_dir if left unspecified') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file') parser.add_argument('--test_hf', action='store_true') parser.add_argument('--test_trt_llm', action='store_true') - parser.add_argument( - '--data_type', - type=str, - choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], - default='fp16') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") - parser.add_argument( - '--eval_task', - type=str, - default='summarize', - choices=['summarize', 'summarize_long', 'code_completion']) + parser.add_argument('--eval_task', + type=str, + default='summarize', + choices=[ + 'summarize', 'summarize_long', 'code_completion', + 'eval_context_ppl' + ]) parser.add_argument('--check_accuracy', action='store_true') parser.add_argument('--tensorrt_llm_rouge1_threshold', type=float, @@ -655,51 +791,33 @@ if __name__ == '__main__': parser.add_argument('--tensorrt_llm_ppl_threshold', type=float, default=15.0) + parser.add_argument( + '--dataset_dir', + type=str, + default=None, + help="The local directory of the dataset for evaluation; " + "will download the dataset from huggingface hub if not specified.") + parser.add_argument( + '--dataset_cache_dir', + type=str, + default=None, + help="The local cache directory for dataset; " + "will use `~/.cache/huggingface/datasets` if not specified.") parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_tps', type=float, default=0) - parser.add_argument('--dataset_path', type=str, default='') - parser.add_argument('--log_level', type=str, default='info') parser.add_argument('--batch_size', type=int, default=1) parser.add_argument('--max_ite', type=int, default=20) parser.add_argument('--output_len', type=int, default=100) parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument( - '--max_attention_window_size', + '--min_input_length', type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - 
parser.add_argument('--num_beams', type=int, default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument( - '--hf_device_map_auto', - action='store_true', - help="Use device map 'auto' to load a pretrained HF model. This may " - "help to test a large model that cannot fit into a singlue GPU.") + default=0, + help='skip the sentences which are shorter than min_input_length.') parser.add_argument( '--output_dir', type=str, @@ -708,17 +826,19 @@ if __name__ == '__main__': "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " "save outputs.") parser.add_argument( - '--medusa_choices', - type=str, + '--rouge_dir', default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + type=str, + help= + "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF." ) parser.add_argument('--stability_test', default=False, action='store_true', help="Whether or not to run stability test for tensorrt_llm.") parser.add_argument('--stability_test_hours', type=float, default=24.0) + parser = add_common_args(parser) args = parser.parse_args() print(args) - main(args) + + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/utils.py b/models/nlp/large_language_model/llama2-70b/trtllm/utils.py index 44042d9e2dcb44dd6cd917ab16a00010e4005202..340ea03995dc62d200234e43ec3e73a4d4923bbb 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/utils.py +++ b/models/nlp/large_language_model/llama2-70b/trtllm/utils.py @@ -12,55 +12,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import json from pathlib import Path from typing import Optional -from transformers import AutoTokenizer, T5Tokenizer +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -import tensorrt_llm +from tensorrt_llm.bindings import GptJsonConfig +from tensorrt_llm.builder import get_engine_version DEFAULT_HF_MODEL_DIRS = { - 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', - 'bloom': 'bigscience/bloom-560m', - 'chatglm_6b': 'THUDM/chatglm-6b', - 'chatglm2_6b': 'THUDM/chatglm2-6b', - 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', - 'chatglm3_6b': 'THUDM/chatglm3-6b', - 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', - 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', - 'falcon': 'tiiuae/falcon-rw-1b', - 'glm_10b': 'THUDM/glm-10b', - 'gpt': 'gpt2-medium', - 'gptj': 'EleutherAI/gpt-j-6b', - 'gptneox': 'EleutherAI/gpt-neox-20b', - 'internlm': 'internlm/internlm-chat-7b', - 'llama': 'meta-llama/Llama-2-7b-hf', - 'mpt': 'mosaicml/mpt-7b', - 'phi': 'microsoft/phi-2', - 'opt': 'facebook/opt-350m', - 'qwen': 'Qwen/Qwen-7B', + 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BaiChuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BloomForCausalLM': 'bigscience/bloom-560m', + 'GLMModel': 'THUDM/glm-10b', + 'ChatGLMModel': 'THUDM/chatglm3-6b', + 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', + 'RWForCausalLM': 'tiiuae/falcon-rw-1b', + 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', + 'GPT2LMHeadModel': 'gpt2', + 'GPT2LMHeadCustomModel': 'gpt2', + 'Starcoder2ForCausalLM': 'bigcode/starcoder2-3b', + 'GPTForCausalLM': 'gpt2', + 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', + 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', + 'InternLMForCausalLM': 'internlm/internlm-chat-7b', + 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', + 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', + 'MPTForCausalLM': 'mosaicml/mpt-7b', + 'PhiForCausalLM': 'microsoft/phi-2', + 'OPTForCausalLM': 'facebook/opt-350m', + 'QWenLMHeadModel': 'Qwen/Qwen-7B', + 'QWenForCausalLM': 'Qwen/Qwen-7B', + 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', + 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', + 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', } +INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). +- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. +- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
+""" + +QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n" + DEFAULT_PROMPT_TEMPLATES = { - 'internlm': - "<|User|>:{input_text}\n<|Bot|>:", - 'qwen': - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'InternLMForCausalLM': "<|User|>:{input_text}\n<|Bot|>:", + 'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION + + "<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE, + 'QWenForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE, } +def supports_inflight_batching(engine_dir): + config_path = Path(engine_dir) / "config.json" + json_config = GptJsonConfig.parse_file(config_path) + model_config = json_config.model_config + return model_config.supports_inflight_batching + + +def read_decoder_start_token_id(engine_dir): + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + return config['pretrained_config']['decoder_start_token_id'] + + def read_model_name(engine_dir: str): - engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + engine_version = get_engine_version(engine_dir) with open(Path(engine_dir) / "config.json", 'r') as f: config = json.load(f) if engine_version is None: - return config['builder_config']['name'] + return config['builder_config']['name'], None - return config['pretrained_config']['architecture'] + model_arch = config['pretrained_config']['architecture'] + model_version = None + if 'GLM' in model_arch: + model_version = config['pretrained_config']['chatglm_version'] + if 'qwen' in model_arch.lower(): + model_version = config['pretrained_config']['qwen_type'] + return model_arch, model_version def throttle_generator(generator, stream_interval): @@ -74,7 +109,8 @@ def throttle_generator(generator, stream_interval): def load_tokenizer(tokenizer_dir: Optional[str] = None, vocab_file: Optional[str] = None, - model_name: str = 'gpt', + model_name: str = 'GPTForCausalLM', + model_version: Optional[str] = None, tokenizer_type: Optional[str] = None): if vocab_file is None: use_fast = True @@ -86,28 +122,34 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, padding_side='left', truncation_side='left', trust_remote_code=True, - tokenizer_type=tokenizer_type, + # tokenizer_type=tokenizer_type, # adapt to llama3 use_fast=use_fast) + elif model_name == 'GemmaForCausalLM' or model_name == 'RecurrentGemmaForCausalLM': + from transformers import GemmaTokenizer + + # Initialize tokenizer from vocab file. 
+ tokenizer = GemmaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False) + elif model_name == 'Grok1ModelForCausalLM': + tokenizer = LlamaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False, + use_fast=False) else: # For gpt-next, directly load from tokenizer.model - assert model_name == 'gpt' tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', - truncation_side='left') - - if model_name == 'qwen': + truncation_side='left', + legacy=False) + if 'qwen' in model_name.lower() and model_version == 'qwen': with open(Path(tokenizer_dir) / "generation_config.json") as f: gen_config = json.load(f) - chat_format = gen_config['chat_format'] - if chat_format == 'raw': - pad_id = gen_config['pad_token_id'] - end_id = gen_config['eos_token_id'] - elif chat_format == 'chatml': - pad_id = tokenizer.im_end_id - end_id = tokenizer.im_end_id - else: - raise Exception(f"unknown chat format: {chat_format}") - elif model_name == 'glm_10b': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif 'GLM' in model_name and model_version == 'glm': pad_id = tokenizer.pad_token_id end_id = tokenizer.eop_token_id else: @@ -117,3 +159,212 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, end_id = tokenizer.eos_token_id return tokenizer, pad_id, end_id + + +def add_common_args(parser): + # sampling arguments + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams > 1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0) + parser.add_argument('--random_seed', type=int, default=0) + parser.add_argument('--early_stopping', + type=int, + help='Use early stopping if num_beams > 1, ' + '1 for early-stopping, 0 for non-early-stopping' + 'other values for stopping by length', + default=1) + parser.add_argument( + '--end_id', + default=None, + type=int, + help="Override tokenizer end_id to stop on given end_id token.") + parser.add_argument( + '--stop_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.' + ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"', + ) + parser.add_argument( + '--bad_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.' 
+ ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"', + ) + parser.add_argument('--no_repeat_ngram_size', type=int, default=None) + + # common runtime arguments + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behavior' + ) + parser.add_argument( + '--multi_block_mode', + action='store_true', + help= + "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel." + ) + parser.add_argument('--enable_context_fmha_fp32_acc', + action='store_true', + help="Enable FMHA runner FP32 accumulation.") + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--no_prompt_template', + dest='use_prompt_template', + default=True, + action='store_false', + help= + "Whether or not to use default prompt template to wrap the input text.") + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + parser.add_argument( + '--prompt_table_path', + type=str, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--prompt_tasks', + help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") + parser.add_argument('--lora_dir', + type=str, + default=None, + nargs="+", + help="The directory of LoRA weights") + parser.add_argument('--lora_ckpt_source', + type=str, + default="hf", + choices=["hf", "nemo"], + help="The source of lora checkpoint.") + parser.add_argument( + '--lora_task_uids', + type=str, + default=None, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module") + parser.add_argument( + '--num_prepend_vtokens', + nargs="+", + type=int, + help="Number of (default) virtual tokens to prepend to each sentence." + " For example, '--num_prepend_vtokens=10' will prepend the tokens" + " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + ) + + # model arguments + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file', + help="Used for sentencepiece tokenizers") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) + parser.add_argument( + '--tokenizer_dir', + default=None, + help='tokenizer path; defaults to hf_model_dir if left unspecified') + + # memory argument + parser.add_argument( + '--gpu_weights_percent', + default=1, + type=float, + help= + 'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.', + ) + parser.add_argument( + '--max_tokens_in_paged_kv_cache', + default=None, + type=int, + help= + 'Specify the maximum number of tokens in a kv cache page (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_enable_block_reuse', + action='store_true', + help= + 'Enables block reuse in kv cache (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_free_gpu_memory_fraction', + default=0.9, + type=float, + help='Specify the free gpu memory fraction.', + ) + parser.add_argument( + '--enable_chunked_context', + action='store_true', + help='Enables chunked context (only available with cpp session).', + ) + + # hf model argument (if use hf model) + parser.add_argument( + '--hf_data_type', + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], + default='fp16', + help="The data type for hf model.") + parser.add_argument( + '--hf_device_map_auto', + action='store_true', + help="Use device map 'auto' to load a pretrained HF model. This may " + "help to test a large model that cannot fit into a singlue GPU.") + + parser.add_argument( + "--return_all_generated_tokens", + default=False, + action="store_true", + help="This option changes the token output only for streaming. " + "If not specified, return only generated tokens at each step. " + "If specified, return the full beams/outputs at each step. " + "It is automatically enabled for num_beams>1 (only available with cpp session). " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + + return parser \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/build.py b/models/nlp/large_language_model/llama2-7b/trtllm/build.py deleted file mode 100644 index 4ff0c9eaa0cedfd382783a5cfcca9175bf38acad..0000000000000000000000000000000000000000 --- a/models/nlp/large_language_model/llama2-7b/trtllm/build.py +++ /dev/null @@ -1,1163 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import json -import math -import os -import sys -import time -from pathlib import Path - -# isort: off -import torch -import torch.multiprocessing as mp -import tensorrt as trt -# isort: on -from transformers import LlamaConfig, LlamaForCausalLM - -try: - from transformers import MixtralForCausalLM -except ImportError: - MixtralForCausalLM = None - -try: - from transformers import LlavaConfig, LlavaForConditionalGeneration -except ImportError: - pass - -import tensorrt_llm -from tensorrt_llm import profiler -from tensorrt_llm._common import check_max_num_tokens -from tensorrt_llm._utils import str_dtype_to_trt -from tensorrt_llm.builder import Builder -from tensorrt_llm.layers import MoeConfig -from tensorrt_llm.layers.attention import PositionEmbeddingType -from tensorrt_llm.logger import logger -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import quantize_model -from tensorrt_llm.network import net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime.lora_manager import LoraConfig - -from tensorrt_llm.models.llama.weight import ( # isort:skip - get_scaling_factors, load_from_awq_llama, load_from_binary, - load_from_gptq_llama, load_from_hf_checkpoint, load_from_hf_llama, - load_from_meta_llama, parse_bin_config) - -MODEL_NAME = "llama" - -# 2 routines: get_engine_name, serialize_engine -# are direct copy from gpt example, TODO: put in utils? - -import onnx -from onnx import TensorProto, helper - - -def trt_dtype_to_onnx(dtype): - if dtype == trt.float16: - return TensorProto.DataType.FLOAT16 - if dtype == trt.bfloat16: - return TensorProto.DataType.BFLOAT16 - elif dtype == trt.float32: - return TensorProto.DataType.FLOAT - elif dtype == trt.int32: - return TensorProto.DataType.INT32 - elif dtype == trt.int64: - return TensorProto.DataType.INT64 - elif dtype == trt.bool: - return TensorProto.DataType.BOOL - else: - raise TypeError("%s is not supported" % dtype) - - -def to_onnx(network, path): - inputs = [] - for i in range(network.num_inputs): - network_input = network.get_input(i) - inputs.append( - helper.make_tensor_value_info( - network_input.name, trt_dtype_to_onnx(network_input.dtype), - list(network_input.shape))) - - outputs = [] - for i in range(network.num_outputs): - network_output = network.get_output(i) - outputs.append( - helper.make_tensor_value_info( - network_output.name, trt_dtype_to_onnx(network_output.dtype), - list(network_output.shape))) - - nodes = [] - for i in range(network.num_layers): - layer = network.get_layer(i) - layer_inputs = [] - for j in range(layer.num_inputs): - ipt = layer.get_input(j) - if ipt is not None: - layer_inputs.append(layer.get_input(j).name) - layer_outputs = [ - layer.get_output(j).name for j in range(layer.num_outputs) - ] - nodes.append( - helper.make_node(str(layer.type), - name=layer.name, - inputs=layer_inputs, - outputs=layer_outputs, - domain="com.nvidia")) - - onnx_model = helper.make_model(helper.make_graph(nodes, - 'attention', - inputs, - outputs, - initializer=None), - producer_name='NVIDIA') - onnx.save(onnx_model, path) - - -def get_engine_name(model, dtype, tp_size, pp_size, rank): - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, - pp_size, rank) - - -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - with open(path, 'wb') as f: - 
f.write(engine) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def parse_arguments(cmd_args=None): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', type=int, default=1) - parser.add_argument('--tp_size', type=int, default=1) - parser.add_argument('--pp_size', type=int, default=1) - parser.add_argument('--model_dir', type=str, default=None) - parser.add_argument('--bin_model_dir', type=str, default=None) - parser.add_argument('--meta_ckpt_dir', type=str, default=None) - parser.add_argument('--quant_ckpt_path', type=str, default=None) - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float32', 'bfloat16', 'float16']) - parser.add_argument( - '--timing_cache', - type=str, - default='model.cache', - help= - 'The path of to read timing cache from, will be ignored if the file does not exist' - ) - parser.add_argument( - '--profiling_verbosity', - type=str, - default='layer_names_only', - choices=['layer_names_only', 'detailed', 'none'], - help= - 'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.' - ) - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument('--vocab_size', type=int, default=32000) - parser.add_argument('--n_layer', type=int, default=32) - parser.add_argument('--n_positions', type=int, default=2048) - parser.add_argument('--n_embd', type=int, default=4096) - parser.add_argument('--n_head', type=int, default=32) - parser.add_argument('--n_kv_head', type=int, default=None) - parser.add_argument('--multiple_of', type=int, default=256) - parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) - parser.add_argument('--inter_size', type=int, default=None) - parser.add_argument('--hidden_act', type=str, default='silu') - parser.add_argument('--rms_norm_eps', type=float, default=1e-06) - parser.add_argument('--max_batch_size', type=int, default=8) - parser.add_argument('--max_input_len', type=int, default=2048) - parser.add_argument('--max_output_len', type=int, default=512) - parser.add_argument('--max_beam_width', type=int, default=1) - parser.add_argument('--rotary_base', type=float, default=10000.0) - parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) - parser.add_argument('--use_gpt_attention_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gemm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_rmsnorm_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_lookup_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'bfloat16', 'float32']) - parser.add_argument('--use_gather_last_token_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_activation_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument('--use_elementwise_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16', 'float32', 'bfloat16']) - parser.add_argument("--use_cast_plugin", action="store_true") - - 
parser.add_argument('--parallel_build', default=False, action='store_true') - parser.add_argument('--enable_context_fmha', - default=False, - action='store_true') - parser.add_argument('--enable_context_fmha_fp32_acc', - default=False, - action='store_true') - parser.add_argument( - '--use_paged_context_fmha', - action='store_true', - help= - 'Activates paged context FMHA. This mode of the context FMHA is required for chunked context, speculative decoding and reuse of KV cache blocks. Context FMHA performance is worse when this mode is on.' - ) - parser.add_argument( - '--multi_block_mode', - default=False, - action='store_true', - help= - 'Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ - It is beneficial when batch x num_heads cannot fully utilize GPU.' - ) - parser.add_argument( - '--disable_xqa', - default=False, - action='store_true', - help= - 'Disable XQA optimization for the generation MHA. See more details in docs/gpt_attention.' - ) - parser.add_argument('--visualize', default=False, action='store_true') - parser.add_argument('--load_by_shard', - action='store_true', - help='Load a pretrained model shard-by-shard.') - parser.add_argument('--enable_debug_output', - default=False, - action='store_true') - parser.add_argument('--gpus_per_node', type=int, default=8) - parser.add_argument('--builder_opt', type=int, default=None) - parser.add_argument( - '--output_dir', - type=str, - default='engine_outputs', - help= - 'The path to save the serialized engine files, timing cache file and model configs' - ) - parser.add_argument('--remove_input_padding', - default=False, - action='store_true') - parser.add_argument( - '--use_fused_mlp', - default=False, - action='store_true', - help= - 'Enable horizontal fusion in GatedMLP, reduces layer input traffic and potentially improves performance. ' - 'For FP8 PTQ, the downside is slight reduction of accuracy because one of the quantization scaling factors are discarded ' - '(0.45734 vs 0.45755 for LLaMA-v2 7B using ammo/examples/hf/instruct_eval/mmlu.py).' - ) - parser.add_argument('--enable_pos_shift', - default=False, - action='store_true', - help='Enable position shift for streamingllm method') - parser.add_argument( - '--dense_context_fmha', - default=False, - action='store_true', - help= - 'Enable dense fmha in context phase, otherwise sliding window attention.' - 'If dense_context_fmha=False, the sliding window size is the max attention window size.' - ) - # Arguments related to the quantization of the model. - parser.add_argument( - '--use_smooth_quant', - default=False, - action="store_true", - help= - 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' - 'See --per_channel and --per_token for finer-grained quantization options.' - ) - parser.add_argument( - '--per_channel', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor for the GEMM\'s result. ' - 'per_channel instead uses a different static scaling factor for each channel. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_token', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale activations in the int8 range. ' - 'per_token chooses at run time, and for each token, a custom scaling factor. 
' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_group', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale weights in the int4 range. ' - 'per_group chooses at run time, and for each group, a custom scaling factor. ' - 'The flag is built for GPTQ/AWQ quantization.') - parser.add_argument('--group_size', - type=int, - default=128, - help='Group size used in GPTQ/AWQ quantization.') - parser.add_argument( - '--int8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--use_parallel_embedding', - action="store_true", - default=False, - help= - 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' - ) - parser.add_argument( - '--embedding_sharding_dim', - type=int, - default=1, # Meta does TP on hidden dim - choices=[0, 1], - help= - 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' - 'To shard it along hidden dimension, set embedding_sharding_dim=1' - 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' - ) - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--quantized_fp8_model_path', - type=str, - default=None, - help='Path of a quantized model checkpoint in .npz format') - parser.add_argument( - '--use_weight_only', - default=False, - action="store_true", - help='Quantize weights for the various GEMMs to INT4/INT8.' - 'See --weight_only_precision to set the precision') - parser.add_argument( - '--disable_weight_only_quant_plugin', - default=False, - action="store_true", - help= - 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--weight_only_precision', - const='int8', - type=str, - nargs='?', - default='int8', - choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], - help= - 'Define the precision for the weights when using weight-only quantization.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--quantize_lm_head', - default=False, - action="store_true", - help='Quantize lm_head weights as well when using int4_awq.') - parser.add_argument( - '--use_inflight_batching', - action="store_true", - default=False, - help="Activates inflight batching mode of gptAttentionPlugin.") - parser.add_argument( - '--paged_kv_cache', - action="store_true", - default=False, - help= - 'By default we use contiguous KV cache. 
By setting this flag you enable paged KV cache' - ) - parser.add_argument('--tokens_per_block', - type=int, - default=128, - help='Number of tokens per block in paged KV cache') - parser.add_argument( - '--max_num_tokens', - type=int, - default=None, - help= - 'Define the max number of tokens supported by the engine, note that it takes no effect if --remove_input_padding is not set' - ) - parser.add_argument( - '--strongly_typed', - default=False, - action="store_true", - help= - 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' - ) - parser.add_argument( - '--use_custom_all_reduce', - action='store_true', - help= - 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') - parser.add_argument( - '--max_prompt_embedding_table_size', - type=int, - default=0, - help='Setting to a value > 0 enables support for prompt tuning.') - parser.add_argument( - '--gather_all_token_logits', - action='store_true', - default=False, - help='Enable both gather_context_logits and gather_generation_logits') - parser.add_argument('--gather_context_logits', - action='store_true', - default=False, - help='Gather context logits') - parser.add_argument('--gather_generation_logits', - action='store_true', - default=False, - help='Gather generation logits') - parser.add_argument( - '--use_lora_plugin', - nargs='?', - const=None, - default=False, - choices=['float16', 'float32', 'bfloat16'], - help="Activates the lora plugin which enables embedding sharing.") - parser.add_argument( - '--lora_target_modules', - nargs='+', - default=None, - choices=[ - "attn_qkv", - "attn_q", - "attn_k", - "attn_v", - "attn_dense", - "mlp_h_to_4h", - "mlp_gate", - "mlp_4h_to_h", - ], - help= - "Add lora in which modules. Only be activated when use_lora_plugin is enabled." - ) - parser.add_argument('--hf_lora_dir', type=str, default=None) - parser.add_argument( - '--max_lora_rank', - type=int, - default=64, - help='maximum lora rank for different lora modules. ' - 'It is used to compute the workspace size of lora plugin.') - parser.add_argument( - '--moe_num_experts', - default=0, - type=int, - help='Specify the number of experts to use for MOE layers') - parser.add_argument( - '--moe_top_k', - default=0, - type=int, - help= - 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' - ) - parser.add_argument( - '--moe_tp_mode', - default=MoeConfig.ParallelismMode.TENSOR_PARALLEL, - type=int, - help= - 'Controls how to distribute experts in TP. Check layers/moe.py for accepted values', - ) - parser.add_argument( - '--moe_renorm_mode', - default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, - type=int, - help= - 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', - ) - parser.add_argument("--total_build_time_target", type=float, default=0) - - args = parser.parse_args(cmd_args) - logger.set_level(args.log_level) - - assert args.total_build_time_target >= 0, "total_build_time_target must bigger than 0" - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." 
- - if not args.remove_input_padding: - if args.use_gpt_attention_plugin: - logger.warning( - f"It is recommended to specify --remove_input_padding when using GPT attention plugin" - ) - - if args.use_inflight_batching: - if not args.use_gpt_attention_plugin: - args.use_gpt_attention_plugin = 'float16' - logger.info( - f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" - ) - if not args.remove_input_padding: - args.remove_input_padding = True - logger.info( - "Using remove input padding for inflight batching mode.") - if not args.paged_kv_cache: - args.paged_kv_cache = True - logger.info("Using paged KV cache for inflight batching mode.") - - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, - args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=args.per_group, - use_int4_weights="int4" in args.weight_only_precision) - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - elif args.fp8_kv_cache: - args.quant_mode = args.quant_mode.set_fp8_kv_cache() - if args.enable_fp8: - args.quant_mode = args.quant_mode.set_fp8_qdq() - - if args.rotary_scaling is not None: - assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." - rotary_scaling = { - "type": args.rotary_scaling[0], - "factor": float(args.rotary_scaling[1]) - } - assert rotary_scaling["type"] in ["linear", "dynamic"] - assert rotary_scaling["factor"] > 1.0 - args.rotary_scaling = rotary_scaling - - if args.model_dir is not None: - hf_config = LlamaConfig.from_pretrained(args.model_dir) - if hf_config.model_type == "llava": - # LLaVA = Vision model + Llama LLM - # We load a llava config and use its' text config as llama config - hf_config = LlavaConfig.from_pretrained(args.model_dir).text_config - hf_config.model_type = "llava" # Replace llama with llava - - args.inter_size = hf_config.intermediate_size # override the inter_size for LLaMA - args.n_embd = hf_config.hidden_size - args.n_head = hf_config.num_attention_heads - if hasattr(hf_config, "num_key_value_heads"): - args.n_kv_head = hf_config.num_key_value_heads - - # hf_config.num_hidden_layers = 1 # only for debug - args.n_layer = hf_config.num_hidden_layers - args.n_positions = hf_config.max_position_embeddings - args.vocab_size = hf_config.vocab_size if hf_config.vocab_size is not None else args.vocab_size - args.hidden_act = hf_config.hidden_act - args.rms_norm_eps = hf_config.rms_norm_eps - # These attributes only exists with Mixtral, for the moment - args.moe_num_experts = getattr(hf_config, "num_local_experts", - args.moe_num_experts) - args.moe_top_k = getattr(hf_config, "num_experts_per_tok", - args.moe_top_k) - args.rotary_base = getattr(hf_config, "rope_theta", args.rotary_base) - args.model_type = hf_config.model_type - if hf_config.model_type == "mixtral": - # HF LLaMA-type models are implicitly using gated activation. 
- # With our MoE implementation, we must make it explicit - args.hidden_act = "swiglu" - - elif args.meta_ckpt_dir is not None: - with open(Path(args.meta_ckpt_dir, "params.json")) as fp: - meta_config: dict = json.load(fp) - args.n_embd = meta_config["dim"] - args.n_head = meta_config["n_heads"] - args.n_layer = meta_config["n_layers"] - args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) - if "hidden_dim" in meta_config: - args.inter_size = meta_config["hidden_dim"] - else: - args.multiple_of = meta_config.get("multiple_of", 1) - n_embd = int(4 * args.n_embd * 2 / 3) - args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) - // args.multiple_of) - args.rms_norm_eps = meta_config["norm_eps"] - args.moe_num_experts = meta_config.get("moe", {}).get("num_experts", 0) - args.moe_top_k = meta_config.get("moe", {}).get("num_experts_per_tok", - 0) - elif args.bin_model_dir is not None: - n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( - Path(args.bin_model_dir) / "config.ini") - args.inter_size = inter_size # override the inter_size for LLaMA - args.n_kv_head = n_kv_head - args.n_embd = n_embd - args.n_head = n_head - args.n_layer = n_layer - args.n_positions = n_positions - args.vocab_size = vocab_size if args.vocab_size is None else args.vocab_size - args.hidden_act = hidden_act - args.rms_norm_eps = 1e-06 - logger.warning("Set rms_norm_eps to 1e-06 directly.") - if args.n_kv_head is None: - args.n_kv_head = args.n_head - elif args.n_kv_head != args.n_head: - assert (args.n_head % args.n_kv_head) == 0, \ - "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." - assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ - "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ - "the tensor parallelism size to be divisible by the number of K/V heads." 
- - hf_modules_to_trtllm_modules = { - "q_proj": "attn_q", - "k_proj": "attn_k", - "v_proj": "attn_v", - "o_proj": "attn_dense", - "gate_proj": "mlp_h_to_4h", - "down_proj": "mlp_4h_to_h", - "up_proj": "mlp_gate" - } # lora modules on llama - - trtllm_modules_to_hf_modules = { - "attn_q": "q_proj", - "attn_k": "k_proj", - "attn_v": "v_proj", - "attn_dense": "o_proj", - "mlp_h_to_4h": "gate_proj", - "mlp_4h_to_h": "down_proj", - "mlp_gate": "up_proj", - } - - lora_config = LoraConfig.from_hf(args.hf_lora_dir, - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules) - - if lora_config.is_valid: - if args.lora_target_modules is None: - args.lora_target_modules = lora_config.lora_target_modules - # the lora checkpoint might finetune the embedding - if lora_config.vocab_size != 0: - args.vocab_size = lora_config.vocab_size - - args.lora_config = lora_config - - if args.weight_only_precision == 'int4_awq': - inter_alignment = args.tp_size * 128 - if args.inter_size % inter_alignment != 0: - args.inter_size = int((args.inter_size + inter_alignment - 1) / - inter_alignment) * inter_alignment - logger.info("To use awq we pad intermediate_size to {}.".format( - args.inter_size)) - - if args.quantize_lm_head: - vocab_alignment = args.tp_size * 64 - if args.vocab_size % vocab_alignment != 0: - args.vocab_size = int((args.vocab_size + vocab_alignment - 1) / - vocab_alignment) * vocab_alignment - logger.info("To use awq we pad vocab_size to {}.".format( - args.vocab_size)) - - assert args.pp_size * args.tp_size == args.world_size - - args.max_num_tokens = check_max_num_tokens( - max_num_tokens=args.max_num_tokens, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - remove_input_padding=args.remove_input_padding) - - assert (math.log2(args.tokens_per_block).is_integer() - ), "tokens_per_block must be power of 2" - if args.enable_context_fmha or args.enable_context_fmha_fp32_acc: - assert (args.tokens_per_block >= - 128), "Context fMHA requires >= 128 tokens per block" - - if args.inter_size is None: - # this should not be need when loading a real model - # but it is helpful when creating a dummy model without loading any real weights - n_embd = int(4 * args.n_embd * 2 / 3) - args.inter_size = args.multiple_of * ( - (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // - args.multiple_of) - logger.info(f"Setting inter_size to {args.inter_size}.") - - if args.enable_pos_shift: - assert args.use_gpt_attention_plugin, "Position shift is only support in the gpt attention plugin." 
- assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc - - if args.moe_num_experts and args.moe_top_k == 0: - args.moe_top_k = 1 - args.moe_config = MoeConfig(args.moe_num_experts, args.moe_top_k, - args.moe_tp_mode, - args.moe_renorm_mode).validate() - - if args.gather_all_token_logits: - args.gather_context_logits = True - args.gather_generation_logits = True - - return args - - -def get_model_object(args, mapping, trt_dtype=None): - if trt_dtype is None: - trt_dtype = str_dtype_to_trt(args.dtype) - # Initialize Module - logger.debug("[Python]llama exampels, Initialize tensorrt_llm.models.LLaMAForCausalLM....") - tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM( - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - dtype=trt_dtype, - mlp_hidden_size=args.inter_size, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - mapping=mapping, - rotary_base=args.rotary_base, - rotary_scaling=args.rotary_scaling, - use_parallel_embedding=args.use_parallel_embedding, - embedding_sharding_dim=args.embedding_sharding_dim, - quant_mode=args.quant_mode, - rms_norm_eps=args.rms_norm_eps, - use_fused_mlp=args.use_fused_mlp, - use_prompt_tuning=args.max_prompt_embedding_table_size > 0, - enable_pos_shift=args.enable_pos_shift, - dense_context_fmha=args.dense_context_fmha, - moe_config=args.moe_config, - max_lora_rank=args.max_lora_rank) - quantize_kwargs = {} - if args.use_smooth_quant or args.use_weight_only: - if args.weight_only_precision == 'int4_awq': - exclude_modules = ['lm_head'] if not args.quantize_lm_head else [] - quantize_kwargs = { - "group_size": args.group_size, - "zero": False, - "pre_quant_scale": True, - "exclude_modules": exclude_modules, - } - elif args.weight_only_precision == 'int4_gptq': - quantize_kwargs = { - "group_size": args.group_size, - "zero": True, - "pre_quant_scale": False, - } - elif args.enable_fp8 or args.fp8_kv_cache: - logger.info(f'Loading scaling factors from ' - f'{args.quantized_fp8_model_path}') - quant_scales = get_scaling_factors(args.quantized_fp8_model_path, - num_layers=args.n_layer, - quant_mode=args.quant_mode) - quantize_kwargs = {"quant_scales": quant_scales} - - if args.use_weight_only and args.moe_config.has_moe(): - if 'exclude_modules' in quantize_kwargs: - quantize_kwargs['exclude_modules'].append('router') - else: - quantize_kwargs['exclude_modules'] = ['lm_head', 'router'] - - tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, - **quantize_kwargs) - if args.per_group: - if args.weight_only_precision == 'int4_awq': - load_from_awq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - quantize_lm_head=args.quantize_lm_head, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - else: - load_from_gptq_llama(tensorrt_llm_llama=tensorrt_llm_llama, - quant_ckpt_path=args.quant_ckpt_path, - mapping=mapping, - dtype=args.dtype, - bin_model_dir=args.bin_model_dir) - elif args.meta_ckpt_dir is not None: - load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, - args.dtype) - elif args.model_dir is not None: - logger.info(f'Loading HF LLaMA ... 
from {args.model_dir}') - tik = time.time() - if not args.load_by_shard: - if args.model_type == "llava": - hf_llava = LlavaForConditionalGeneration.from_pretrained( - args.model_dir, torch_dtype="auto") - hf_llama = hf_llava.language_model - else: - hf_model = LlamaForCausalLM if args.model_type != "mixtral" else MixtralForCausalLM - hf_llama = hf_model.from_pretrained( - args.model_dir, - device_map={ - "model": "cpu", - "lm_head": "cpu", - "embed_tokens": "cpu", - "layers": "cpu", - "norm": "cpu", - }, # Load to CPU memory - torch_dtype='auto', - ) - use_gemm_woq_plugin = not args.disable_weight_only_quant_plugin - # hf_llama.config.num_hidden_layers = 1 # only for debug - load_from_hf_llama(tensorrt_llm_llama, - hf_llama, - mapping=mapping, - dtype=args.dtype, - use_gemm_woq_plugin=use_gemm_woq_plugin, - lora_config=args.lora_config) - del hf_llama - else: - load_from_hf_checkpoint(tensorrt_llm_llama, - args.model_dir, - mapping, - dtype=args.dtype, - lora_config=args.lora_config) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'HF LLaMA loaded. Total time: {t}') - - elif args.bin_model_dir is not None: - load_from_binary(tensorrt_llm_llama, - args.bin_model_dir, - mapping, - fp16=(args.dtype == 'float16'), - multi_query_mode=(args.n_kv_head != args.n_head)) - - return tensorrt_llm_llama - - -def update_plugin_configs(args, network): - if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin( - dtype=args.use_gpt_attention_plugin) - if args.use_gemm_plugin: - if not args.enable_fp8: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - else: - logger.info( - "Gemm plugin does not support FP8. Disabled Gemm plugin.") - if args.use_rmsnorm_plugin: - network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) - if args.use_lora_plugin: - network.plugin_config.set_lora_plugin(dtype=args.use_lora_plugin) - if args.use_lookup_plugin: - network.plugin_config.set_lookup_plugin(dtype=args.use_lookup_plugin) - if args.use_gather_last_token_plugin: - network.plugin_config.set_gather_last_token_plugin(dtype=args.use_gather_last_token_plugin) - if args.use_activation_plugin: - network.plugin_config.set_activation_plugin(dtype=args.use_activation_plugin) - if args.use_elementwise_plugin: - network.plugin_config.set_elementwise_plugin(dtype=args.use_elementwise_plugin) - if args.use_cast_plugin: - network.plugin_config.set_cast_plugin() - - # Quantization plugins. 
- if args.use_smooth_quant: - network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) - network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) - network.plugin_config.set_quantize_tensor_plugin() - network.plugin_config.set_quantize_per_token_plugin() - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha( - ContextFMHAType.enabled_with_fp32_acc) - if args.multi_block_mode: - network.plugin_config.enable_mmha_multi_block_mode() - if not args.disable_xqa: - network.plugin_config.enable_xqa_optimization() - - if args.use_weight_only and not args.disable_weight_only_quant_plugin: - if args.per_group: - network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( - dtype=args.dtype) - else: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.dtype) - if args.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype, - args.use_custom_all_reduce) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) - return - - -def build_rank_engine(builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, rank, args): - ''' - @brief: Build the engine on the given rank. - @param rank: The rank to build the engine. - @param args: The cmd line arguments. - @return: The built engine. - ''' - dtype = str_dtype_to_trt(args.dtype) - mapping = Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.tp_size, - pp_size=args.pp_size) - - assert args.n_layer % args.pp_size == 0, \ - f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} Engine build starts') - # Initialize Module - tensorrt_llm_llama = get_model_object(args, - mapping=mapping, - trt_dtype=dtype) - - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {rank} model weight loaded.') - - # Module -> Network - logger.debug("[Python]llama exampels, convert module to network....") - network = builder.create_network() - network.trt_network.name = engine_name - update_plugin_configs(args, network) - - if args.use_paged_context_fmha: - assert args.enable_context_fmha or args.enable_context_fmha_fp32_acc, "context fmha must be enabled" - network.plugin_config.set_paged_context_fmha() - - logger.debug(f"[Python]llama exampels, network.plugin_config: \n{network.plugin_config}") - with net_guard(network): - # Prepare - network.set_named_parameters(tensorrt_llm_llama.named_parameters()) - - # Forward - inputs = tensorrt_llm_llama.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_seq_len=args.max_input_len + args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - max_num_tokens=args.max_num_tokens, - prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules) - logger.info(f"[Python]llama exampels, forward....\n") - tensorrt_llm_llama(*inputs) - logger.info(f"[Python]llama exampels, forward finished\n") - if args.enable_debug_output: - # mark intermediate nodes' 
outputs - for k, v in tensorrt_llm_llama.named_network_outputs(): - logger.debug(f"enable_debug_output, debug tensor name: {k}") - v = v.trt_tensor - v.name = k - network.trt_network.mark_output(v) - v.dtype = dtype - if args.visualize: - model_path = os.path.join(args.output_dir, 'test.onnx') - to_onnx(network.trt_network, model_path) - - logger.debug("[Python]llama examples, tensorrt_llm.graph_rewriting.optimize....") - tensorrt_llm.graph_rewriting.optimize(network) - - engine = None - - # Network -> Engine - logger.debug("[Python]llama examples, builder.build_engine....") - engine = builder.build_engine(network, builder_config) - if rank == 0: - config_path = os.path.join(args.output_dir, 'config.json') - builder.save_config(builder_config, config_path) - - return engine - - -def get_builder_config_namespace(args, cache): - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). - int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - config = argparse.Namespace( - name=MODEL_NAME, - precision=args.dtype, - timing_cache=args.timing_cache if cache is None else cache, - profiling_verbosity=args.profiling_verbosity, - tensor_parallel=args.tp_size, - pipeline_parallel=args.pp_size, - parallel_build=args.parallel_build, - num_layers=args.n_layer, - num_heads=args.n_head, - num_kv_heads=args.n_kv_head, - hidden_size=args.n_embd, - vocab_size=args.vocab_size, - hidden_act=args.hidden_act, - max_position_embeddings=args.n_positions, - max_batch_size=args.max_batch_size, - max_beam_width=args.max_beam_width, - max_input_len=args.max_input_len, - max_output_len=args.max_output_len, - max_num_tokens=args.max_num_tokens, - int8=int8_trt_flag, - quant_mode=args.quant_mode, - strongly_typed=args.strongly_typed, - opt_level=args.builder_opt, - max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, - gather_context_logits=args.gather_context_logits, - gather_generation_logits=args.gather_generation_logits, - lora_target_modules=args.lora_target_modules, - mlp_hidden_size=args.inter_size, - hf_modules_to_trtllm_modules=args.lora_config. - hf_modules_to_trtllm_modules, - trtllm_modules_to_hf_modules=args.lora_config. - trtllm_modules_to_hf_modules, - ) - return config - - -def build(rank, args): - torch.cuda.set_device(rank % args.gpus_per_node) - logger.set_level(args.log_level) - os.makedirs(args.output_dir, exist_ok=True) - - # when doing serializing build, all ranks share one engine - builder = Builder() - cache = None - for cur_rank in range(args.world_size): - # skip other ranks if parallel_build is enabled - if args.parallel_build and cur_rank != rank: - continue - tik = time.time() - - # NOTE: int8 flag is required to be true when INT8 tensors are exposed to TRT - # TRT-LLM has INT8 I/O when act/weights are quantized without group-scaling (AWQ, GPTQ) - # OR INT8 KV cache is set to contiguous (without paged KV cache enabled). 
- int8_trt_flag = (args.quant_mode.has_act_or_weight_quant() - and not args.quant_mode.has_per_group_scaling()) or ( - not args.paged_kv_cache - and args.quant_mode.has_int8_kv_cache()) - builder_config = builder.create_builder_config( - **vars(get_builder_config_namespace(args, cache))) - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, - args.pp_size, cur_rank) - logger.debug("[Python]llama example, build_rank_engine....") - engine = build_rank_engine(builder, builder_config, engine_name, - cur_rank, args) - assert engine is not None, f'Failed to build engine for rank {cur_rank}' - - local_num_kv_heads = (args.n_kv_head + args.world_size - - 1) // args.world_size - kv_dtype = str_dtype_to_trt(args.dtype) - if args.quant_mode.has_int8_kv_cache(): - kv_dtype = str_dtype_to_trt('int8') - elif args.quant_mode.has_fp8_kv_cache(): - kv_dtype = str_dtype_to_trt('fp8') - - # FIXME (Not Support libnvidia-ml.so) - # profiler.check_gpt_mem_usage( - # engine=engine, - # kv_dtype=kv_dtype, - # use_gpt_attention_plugin=args.use_gpt_attention_plugin, - # paged_kv_cache=args.paged_kv_cache, - # max_batch_size=args.max_batch_size, - # max_beam_width=args.max_beam_width, - # max_seq_len=args.max_input_len + args.max_output_len, - # local_num_kv_heads=local_num_kv_heads, - # head_size=args.n_embd / args.n_head, - # num_layers=args.n_layer) - - if cur_rank == 0: - # Use in-memory timing cache for multiple builder passes. - if not args.parallel_build: - cache = builder_config.trt_builder_config.get_timing_cache() - - serialize_engine(engine, os.path.join(args.output_dir, engine_name)) - del engine - # FIXME (Not Support libnvidia-ml.so) - # profiler.print_memory_usage(f'Rank {cur_rank} Engine serialized') - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info( - f'Rank {cur_rank} Engine build time: {t} - {tok - tik} (sec)') - - if rank == 0: - ok = builder.save_timing_cache( - builder_config, os.path.join(args.output_dir, "model.cache")) - assert ok, "Failed to save timing cache." - - -if __name__ == '__main__': - args = parse_arguments() - print(args) - tik = time.time() - if args.parallel_build and args.world_size > 1 and \ - torch.cuda.device_count() >= args.world_size: - logger.warning( - f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' - ) - mp.spawn(build, nprocs=args.world_size, args=(args, )) - else: - args.parallel_build = False - logger.info('Serially build TensorRT engines.') - build(0, args) - - tok = time.time() - build_engine_time = tok - tik - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of building all {args.world_size} engines: {t}') - - if args.total_build_time_target != 0: - status = build_engine_time <= args.total_build_time_target - if status: - print("successful.") - else: - print(f"Build engine time check failed! 
Target: {args.total_build_time_target}, Actual: {build_engine_time}") - sys.exit(int(not status)) diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/convert_checkpoint.py b/models/nlp/large_language_model/llama2-7b/trtllm/convert_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..6c44e840456923d9e77b43c1a509e77658d175ae --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/convert_checkpoint.py @@ -0,0 +1,500 @@ +import argparse +import json +import os +import time +import traceback +from concurrent.futures import ThreadPoolExecutor, as_completed + +from transformers import AutoConfig + +import tensorrt_llm +from tensorrt_llm._utils import release_gc +from tensorrt_llm.layers import MoeConfig +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import LLaMAForCausalLM +from tensorrt_llm.models.convert_utils import has_safetensors +from tensorrt_llm.models.llama.convert import load_hf_llama +from tensorrt_llm.models.modeling_utils import QuantConfig +from tensorrt_llm.quantization import QuantAlgo + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + + parser.add_argument('--tp_size', + type=int, + default=1, + help='N-way tensor parallelism size') + parser.add_argument('--pp_size', + type=int, + default=1, + help='N-way pipeline parallelism size') + parser.add_argument( + '--moe_tp_size', + type=int, + default=-1, + help= + 'N-way tensor parallelism size for MOE, default is tp_size, which will do tp-only for MoE' + ) + parser.add_argument( + '--moe_ep_size', + type=int, + default=-1, + help= + 'N-way expert parallelism size for MOE, default is 1, which will do tp-only for MoE' + ) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument('--vocab_size', type=int, default=32000) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--inter_size', type=int, default=11008) + parser.add_argument('--multiple_of', type=int, default=None) + parser.add_argument('--ffn_dim_multiplier', type=float, default=None) + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--disable_weight_only_quant_plugin', + default=False, + action="store_true", + help= + 'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' 
+    )
+    parser.add_argument(
+        '--calib_dataset',
+        type=str,
+        default='ccdv/cnn_dailymail',
+        help=
+        "The huggingface dataset name or the local directory of the dataset for calibration."
+    )
+    parser.add_argument(
+        "--smoothquant",
+        "-sq",
+        type=float,
+        default=None,
+        help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)"
+        " to SmoothQuant the model, and output int8 weights."
+        " A good first try is 0.5. Must be in [0, 1]")
+    parser.add_argument(
+        '--per_channel',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor for the GEMM\'s result. '
+        'per_channel instead uses a different static scaling factor for each channel. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--per_token',
+        action="store_true",
+        default=False,
+        help=
+        'By default, we use a single static scaling factor to scale activations in the int8 range. '
+        'per_token chooses at run time, and for each token, a custom scaling factor. '
+        'The latter is usually more accurate, but a little slower.')
+    parser.add_argument(
+        '--int8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for the KV cache'
+    )
+    parser.add_argument(
+        '--fp8_kv_cache',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for the KV cache'
+    )
+    parser.add_argument(
+        '--quant_ckpt_path',
+        type=str,
+        default=None,
+        help='Path of a quantized model checkpoint in .safetensors format')
+    parser.add_argument("--use_fp8_rowwise",
+                        action="store_true",
+                        default=False,
+                        help="Enable FP8 per-token per-channel quantization")
+
+    parser.add_argument(
+        '--per_group',
+        default=False,
+        action="store_true",
+        help=
+        'By default, we use a single static scaling factor to scale weights in the int4 range. '
+        'per_group chooses at run time, and for each group, a custom scaling factor. '
+        'The flag is built for GPTQ/AWQ quantization.')
+
+    parser.add_argument('--load_by_shard',
+                        action='store_true',
+                        help='Load a pretrained model shard-by-shard.')
+    parser.add_argument('--hidden_act', type=str, default='silu')
+
+    parser.add_argument('--rotary_base', type=float, default=10000.0)
+
+    parser.add_argument('--group_size',
+                        type=int,
+                        default=128,
+                        help='Group size used in GPTQ quantization.'
+                        )  # AWQ is only supported by quantize.py script
+
+    parser.add_argument("--load_model_on_cpu", action="store_true")
+    parser.add_argument(
+        '--use_parallel_embedding',
+        action="store_true",
+        default=False,
+        help=
+        'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled.'
+    )
+    parser.add_argument(
+        '--embedding_sharding_dim',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=
+        'By default the embedding lookup table is sharded along the vocab dimension (embedding_sharding_dim=0). '
+        'To shard it along the hidden dimension, set embedding_sharding_dim=1. '
+        'Note: embedding sharing is only enabled when embedding_sharding_dim = 0.'
+    )
+    parser.add_argument(
+        '--use_embedding_sharing',
+        action="store_true",
+        default=False,
+        help=
+        'Try to reduce the engine size by sharing the embedding lookup table between two layers. '
+ 'Note: the flag might not take effect when the criteria are not met.') + parser.add_argument('--output_dir', + type=str, + default='tllm_checkpoint', + help='The path to save the TensorRT-LLM checkpoint') + parser.add_argument( + '--workers', + type=int, + default=1, + help='The number of workers for converting checkpoint in parallel') + parser.add_argument( + '--moe_num_experts', + default=0, + type=int, + help='Specify the number of experts to use for MOE layers') + parser.add_argument( + '--moe_top_k', + default=0, + type=int, + help= + 'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set' + ) + parser.add_argument( + '--moe_renorm_mode', + default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE, + type=int, + help= + 'Controls renormalization after gate logits. Check layers/moe.py for accepted values', + ) + parser.add_argument( + '--save_config_only', + action="store_true", + default=False, + help= + 'Only save the model config w/o read and converting weights, be careful, this is for debug only' + ) + parser.add_argument( + '--remove_duplicated_kv_heads', + action="store_true", + default=False, + help= + 'Only used to remove the duplicated kv heads of llama-3.1 405B HF model.' + ) + parser.add_argument('--log_level', type=str, default='info') + + args = parser.parse_args() + # changing the default to be consistent as the cli help said. + if args.moe_num_experts and args.moe_top_k == 0: + args.moe_top_k = 1 + return args + + +def args_to_quant_config(args: argparse.Namespace) -> QuantConfig: + '''return config dict with quantization info based on the command line args + ''' + quant_config = QuantConfig() + if args.use_weight_only: + if args.weight_only_precision == 'int8': + quant_config.quant_algo = QuantAlgo.W8A16 + elif args.weight_only_precision == 'int4': + quant_config.quant_algo = QuantAlgo.W4A16 + elif args.smoothquant: + quant_config.smoothquant_val = args.smoothquant + if args.per_channel: + if args.per_token: + quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN + else: + quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN + else: + if args.per_token: + quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN + else: + quant_config.quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN + elif args.use_fp8_rowwise: + quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN + # this will be overwritten if specified in the hf config. + quant_config.clamp_val = [-1200.0, 1200.0] + + if args.int8_kv_cache: + quant_config.kv_cache_quant_algo = QuantAlgo.INT8 + + if args.fp8_kv_cache: + quant_config.kv_cache_quant_algo = QuantAlgo.FP8 + + if args.weight_only_precision == 'int4_gptq': + quant_config.group_size = args.group_size + quant_config.has_zero_point = True + quant_config.pre_quant_scale = False + quant_config.quant_algo = QuantAlgo.W4A16_GPTQ + + return quant_config + + +def update_quant_config_from_hf(quant_config, hf_config) -> QuantConfig: + hf_config_dict = hf_config.to_dict() + if hf_config_dict.get('quantization_config'): + # update the quant_algo, and clamp_val. 
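+        # Illustrative sketch of the HF `quantization_config` block this branch
+        # expects; only 'quant_method' and 'activation_scale_ub' are read below,
+        # and the 1200.0 fallback matches the default used in this function:
+        #   "quantization_config": {
+        #       "quant_method": "fbgemm_fp8",
+        #       "activation_scale_ub": 1200.0
+        #   }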
+ if hf_config_dict['quantization_config'].get( + 'quant_method') == 'fbgemm_fp8': + logger.info( + "Load quantization configs from huggingface model_config.") + quant_config.quant_algo = QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN + activation_scale_ub = hf_config_dict['quantization_config'].get( + 'activation_scale_ub', 1200.0) + quant_config.clamp_val = [-activation_scale_ub, activation_scale_ub] + return quant_config + + +def convert_and_save_meta(args, rank): + mapping = Mapping(world_size=args.tp_size * args.pp_size, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size, + rank=rank) + llama = LLaMAForCausalLM.from_meta_ckpt( + args.meta_ckpt_dir, + args.dtype, + quant_config=args_to_quant_config(args), + mapping=mapping, + use_parallel_embedding=args.use_parallel_embedding, + embedding_sharding_dim=args.embedding_sharding_dim) + llama.save_checkpoint(args.output_dir, save_config=(rank == 0)) + + +def args_to_build_options(args): + return { + 'use_parallel_embedding': args.use_parallel_embedding, + 'embedding_sharding_dim': args.embedding_sharding_dim, + 'share_embedding_table': args.use_embedding_sharing, + 'disable_weight_only_quant_plugin': + args.disable_weight_only_quant_plugin, + 'remove_duplicated_kv_heads': args.remove_duplicated_kv_heads, + 'quant_ckpt_path': args.quant_ckpt_path, + 'load_model_on_cpu': args.load_model_on_cpu, + } + + +def from_cli_args(args): + n_kv_head = args.n_kv_head if args.n_kv_head is not None else args.n_head + config = { + 'architecture': "LlamaForCausalLM", + 'dtype': args.dtype, + 'logits_dtype': 'float32', + 'num_hidden_layers': args.n_layer, + 'num_attention_heads': args.n_head, + 'hidden_size': args.n_embd, + 'intermediate_size': args.inter_size, + 'ffn_dim_multiplier': args.ffn_dim_multiplier, + 'multiple_of': args.multiple_of, + 'num_key_value_heads': n_kv_head, + 'vocab_size': args.vocab_size, + 'position_embedding_type': 'rope_gpt_neox', + 'max_position_embeddings': args.n_positions, + 'hidden_act': args.hidden_act, + 'rotary_base': args.rotary_base, + 'norm_epsilon': args.rms_norm_eps, + 'moe': { + 'num_experts': args.moe_num_experts, + 'top_k': args.moe_top_k, + 'normalization_mode': args.moe_renorm_mode, + }, + 'mapping': { + 'world_size': args.tp_size * args.pp_size, + 'tp_size': args.tp_size, + 'pp_size': args.pp_size, + 'moe_tp_size': args.moe_tp_size, + 'moe_ep_size': args.moe_ep_size, + }, + 'quantization': args_to_quant_config(args).to_dict() + } + config.update(args_to_build_options(args)) + return config + + +def convert_and_save_hf(args): + model_dir = args.model_dir + load_model_on_cpu = args.load_model_on_cpu + load_by_shard = args.load_by_shard + world_size = args.tp_size * args.pp_size + # Need to convert the cli args to the kay-value pairs and override them in the generate config dict. + # Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now, + # before the refactor is done. + override_fields = {} + override_fields.update(args_to_build_options(args)) + + quant_config = args_to_quant_config(args) + + try: + hf_config = AutoConfig.from_pretrained(model_dir, + trust_remote_code=True) + quant_config = update_quant_config_from_hf(quant_config, hf_config) + except: + # llava_llama needs its own defined config. 
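+        # If the config cannot be read, the quant_config derived from the CLI
+        # flags above is kept unchanged.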
+ logger.warning("AutoConfig cannot load the huggingface config.") + + if args.smoothquant is not None or args.int8_kv_cache: + assert not args.load_by_shard, "When using quantization, TRT-LLM needs to load the whole HF model, thus load by shard not supported" + mapping = Mapping(world_size=world_size, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + # TODO: support moe quantization for tp + ep + LLaMAForCausalLM.quantize( + args.model_dir, + args.output_dir, + dtype=args.dtype, + mapping=mapping, + quant_config=quant_config, + device='cpu' if args.load_model_on_cpu else 'cuda', + calib_dataset=args.calib_dataset, + **override_fields) + else: + # When not loading by shard, preload one complete model and then slice per rank weights from this + # this saves the disk reloading time + hf_model = None + if os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER") is not None \ + and os.environ.get("TRTLLM_DISABLE_UNIFIED_CONVERTER").strip() == "2": + if "vila" in model_dir or "llava" in model_dir: + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + elif not (args.load_by_shard or + (has_safetensors(model_dir) + and not quant_config.quant_mode.has_any_quant())): + hf_model = load_hf_llama(model_dir, load_model_on_cpu) + + def convert_and_save_rank(args, rank): + mapping = Mapping(world_size=world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size, + moe_tp_size=args.moe_tp_size, + moe_ep_size=args.moe_ep_size) + llama = LLaMAForCausalLM.from_hugging_face( + model_dir if hf_model is None else hf_model, + args.dtype, + mapping=mapping, + quant_config=quant_config, + load_by_shard=load_by_shard, + **override_fields, + ) + llama.save_checkpoint(args.output_dir, save_config=(rank == 0)) + del llama + + execute(args.workers, [convert_and_save_rank] * world_size, args) + release_gc() + + +def execute(workers, func, args): + if workers == 1: + for rank, f in enumerate(func): + f(args, rank) + else: + with ThreadPoolExecutor(max_workers=workers) as p: + futures = [p.submit(f, args, rank) for rank, f in enumerate(func)] + exceptions = [] + for future in as_completed(futures): + try: + future.result() + except Exception as e: + traceback.print_exc() + exceptions.append(e) + assert len( + exceptions + ) == 0, "Checkpoint conversion failed, please check error log." 
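+
+# Usage sketch (for orientation only; `main()` below does exactly this): the
+# meta-checkpoint path fans one conversion job out per rank via
+#   execute(args.workers, [convert_and_save_meta] * world_size, args)
+# while the HF path goes through convert_and_save_hf(args), which builds the
+# per-rank closure `convert_and_save_rank` and dispatches it the same way.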
+ + +def main(): + print(tensorrt_llm.__version__) + args = parse_arguments() + logger.set_level(args.log_level) + + world_size = args.tp_size * args.pp_size + if (args.moe_tp_size == -1 and args.moe_ep_size == -1): + # moe default to tp-only + args.moe_tp_size = args.tp_size + args.moe_ep_size = 1 + elif (args.moe_tp_size == -1): + args.moe_tp_size = args.tp_size // args.moe_ep_size + elif (args.moe_ep_size == -1): + args.moe_ep_size = args.tp_size // args.moe_tp_size + assert (args.moe_tp_size * args.moe_ep_size == args.tp_size + ), "moe_tp_size * moe_ep_size must equal to tp_size" + tik = time.time() + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + if (args.model_dir is None + and args.meta_ckpt_dir is None): # generate fake config.json + config = from_cli_args(args) + with open(os.path.join(args.output_dir, 'config.json'), 'w') as f: + json.dump(config, f, indent=4) + elif args.meta_ckpt_dir is not None: + assert args.model_dir is None, "Shall not specify both meta checkpoint dir and hugging face dir" + execute(args.workers, [convert_and_save_meta] * world_size, args) + else: # all other paths from hf model + assert args.model_dir is not None + assert ( + args.quant_ckpt_path is not None + and args.weight_only_precision == 'int4_gptq' + ) or args.quant_ckpt_path is None, "only gptq weights only needs this option" + convert_and_save_hf(args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + print(f'Total time of converting checkpoints: {t}') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/run.py b/models/nlp/large_language_model/llama2-7b/trtllm/run.py index 3899ec9d55a33bca6eeeac4840353345467b474d..5590749592d3237b3087f2b745fd9abb9569bf51 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/run.py +++ b/models/nlp/large_language_model/llama2-7b/trtllm/run.py @@ -16,63 +16,45 @@ import argparse import ast import csv +import os from pathlib import Path import sys import time +import sys +import time import numpy as np import torch +from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, + add_common_args, load_tokenizer, read_decoder_start_token_id, + read_model_name, supports_inflight_batching, + throttle_generator) + import tensorrt_llm import tensorrt_llm.profiler from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner -from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES, - load_tokenizer, read_model_name, throttle_generator) - if PYTHON_BINDINGS: from tensorrt_llm.runtime import ModelRunnerCpp def parse_arguments(args=None): + # see `add_common_args` for extended list of arguments parser = argparse.ArgumentParser() + parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--max_output_len', type=int, required=True) - parser.add_argument( - '--max_attention_window_size', - type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - parser.add_argument('--log_level', type=str, default='error') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") parser.add_argument( '--input_text', type=str, 
nargs='+', default=["Born in north-east France, Soyer trained as a"]) - parser.add_argument( - '--no_prompt_template', - dest='use_prompt_template', - default=True, - action='store_false', - help= - "Whether or not to use default prompt template to wrap the input text.") parser.add_argument( '--input_file', type=str, help= 'CSV or Numpy file containing tokenized input. Alternative to text input.', default=None) - parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument('--output_csv', type=str, help='CSV file where the tokenized output is stored.', @@ -87,89 +69,26 @@ def parse_arguments(args=None): help= 'Numpy file where the generation logits are stored. Use only when num_beams==1', default=None) - parser.add_argument('--tokenizer_dir', - help="HF tokenizer config path", - default='gpt2') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file', - help="Used for sentencepiece tokenizers") - parser.add_argument('--num_beams', - type=int, - help="Use beam search if num_beams >1", - default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument('--streaming', default=False, action='store_true') - parser.add_argument('--streaming_interval', - type=int, - help="How often to return tokens when streaming.", - default=5) - parser.add_argument( - '--prompt_table_path', - type=str, - help="Path to .npy file, exported by nemo_prompt_convert.py") - parser.add_argument( - '--prompt_tasks', - help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") - parser.add_argument('--lora_dir', + parser.add_argument('--output_log_probs_npy', type=str, - default=None, - nargs="+", - help="The directory of LoRA weights") - parser.add_argument( - '--lora_task_uids', - type=str, - default=None, - nargs="+", - help="The list of LoRA task uids; use -1 to disable the LoRA module") - parser.add_argument('--lora_ckpt_source', + help='Numpy file where the log_probs are stored', + default=None) + parser.add_argument('--output_cum_log_probs_npy', type=str, - default="hf", - choices=["hf", "nemo"], - help="The source of lora checkpoint.") - parser.add_argument( - '--num_prepend_vtokens', - nargs="+", - type=int, - help="Number of (default) virtual tokens to prepend to each sentence." 
- " For example, '--num_prepend_vtokens=10' will prepend the tokens" - " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + help='Numpy file where the cum_log_probs are stored', + default=None) parser.add_argument( '--run_profiling', default=False, action='store_true', help="Run several 10 iterations to profile the inference latencies.") - parser.add_argument( - '--medusa_choices', - type=str, - default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." - ) parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_qps', type=float, default=0) + parser = add_common_args(parser) return parser.parse_args(args=args) @@ -182,7 +101,8 @@ def parse_input(tokenizer, max_input_length=923, pad_id=None, num_prepend_vtokens=[], - model_name=None): + model_name=None, + model_version=None): if pad_id is None: pad_id = tokenizer.pad_token_id @@ -211,13 +131,12 @@ def parse_input(tokenizer, elif input_file.endswith('.txt'): with open(input_file, 'r', encoding='utf-8', errors='replace') as txt_file: - input_text = txt_file.read() - input_ids = tokenizer.encode( + input_text = txt_file.readlines() + batch_input_ids = tokenizer( input_text, add_special_tokens=add_special_tokens, truncation=True, - max_length=max_input_length) - batch_input_ids.append(input_ids) + max_length=max_input_length)["input_ids"] else: print('Input file format not supported.') raise SystemExit @@ -230,9 +149,11 @@ def parse_input(tokenizer, batch_input_ids[i] = list( range(base_vocab_size, base_vocab_size + length)) + batch_input_ids[i] - if model_name == 'glm_10b': + + if input_file is None and 'GLM' in model_name and model_version == 'glm': for ids in batch_input_ids: ids.append(tokenizer.sop_token_id) + batch_input_ids = [ torch.tensor(x, dtype=torch.int32) for x in batch_input_ids ] @@ -247,7 +168,11 @@ def print_output(tokenizer, output_npy=None, context_logits=None, generation_logits=None, - output_logits_npy=None): + cum_log_probs=None, + log_probs=None, + output_logits_npy=None, + output_cum_log_probs_npy=None, + output_log_probs_npy=None): batch_size, num_beams, _ = output_ids.size() if output_csv is None and output_npy is None: for batch_idx in range(batch_size): @@ -265,7 +190,6 @@ def print_output(tokenizer, f'Output [Text {batch_idx} Beam {beam}]: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) - if output_csv is not None: output_file = Path(output_csv) output_file.parent.mkdir(exist_ok=True, parents=True) @@ -303,6 +227,20 @@ def print_output(tokenizer, dtype='float32') np.save(output_generation_logits_file, generation_outputs) + # Save cum log probs + if cum_log_probs is not None and output_cum_log_probs_npy is not None: + cum_log_probs_file = Path(output_cum_log_probs_npy) + cum_log_probs_outputs = np.array(cum_log_probs.cpu().contiguous(), + dtype='float32') + np.save(cum_log_probs_file, cum_log_probs_outputs) + + # Save cum log probs + if log_probs is not None and output_log_probs_npy is not None: + log_probs_file = Path(output_log_probs_npy) + log_probs_outputs = np.array(log_probs.cpu().contiguous(), + dtype='float32') + np.save(log_probs_file, log_probs_outputs) + def check_status(args, load_engine_time, qps): print("==================== check status ====================") @@ -320,28 +258,35 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = 
read_model_name(args.engine_dir) - if args.tokenizer_dir is None: + # different handling if encoder-decoder models + is_enc_dec = { + name + for name in os.listdir(args.engine_dir) + if os.path.isdir(os.path.join(args.engine_dir, name)) + } == {'encoder', 'decoder'} + if is_enc_dec: + logger.warning( + "This path is an encoder-decoder model. Using different handling.") + assert not args.use_py_session, "Encoder-decoder models don't have a unified python runtime, please use its own examples/enc_dec/run.py instead." + + model_name, model_version = read_model_name( + args.engine_dir) if not is_enc_dec else ("", "") + if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS: + logger.warning( + "tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect." + ) args.tokenizer_dir = DEFAULT_HF_MODEL_DIRS[model_name] tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) - # # An example to stop generation when the model generate " London" on first sentence, " eventually became" on second sentence - # stop_words_list = [[" London"], ["eventually became"]] - # stop_words_list = tensorrt_llm.runtime.to_word_list_format(stop_words_list, tokenizer) - # stop_words_list = torch.Tensor(stop_words_list).to(torch.int32).to("cuda").contiguous() - stop_words_list = None - - # # An example to prevent generating " chef" on first sentence, " eventually" and " chef before" on second sentence - # bad_words_list = [[" chef"], [" eventually, chef before"]] - # bad_words_list = tensorrt_llm.runtime.to_word_list_format(bad_words_list, tokenizer) - # bad_words_list = torch.Tensor(bad_words_list).to(torch.int32).to("cuda").contiguous() - bad_words_list = None + if args.end_id: + end_id = args.end_id prompt_template = None if args.use_prompt_template and model_name in DEFAULT_PROMPT_TEMPLATES: @@ -354,8 +299,47 @@ def main(args): max_input_length=args.max_input_length, pad_id=pad_id, num_prepend_vtokens=args.num_prepend_vtokens, - model_name=model_name) - input_lengths = [x.size(0) for x in batch_input_ids] + model_name=model_name, + model_version=model_version) + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * len(batch_input_ids) + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + + if is_enc_dec: + encoder_input_ids = batch_input_ids + decoder_start_token_id = read_decoder_start_token_id( + os.path.join(args.engine_dir, "decoder")) + decoder_input_ids = [ + torch.tensor([decoder_start_token_id], dtype=torch.int32) + for _ in batch_input_ids + ] + + input_lengths = [x.size(0) for x in decoder_input_ids + ] if is_enc_dec else [x.size(0) for x in batch_input_ids] + encoder_input_lengths = [x.size(0) + for x in encoder_input_ids] if is_enc_dec else None + + if not args.use_py_session and not supports_inflight_batching( + os.path.join(args.engine_dir, "decoder") if is_enc_dec else args. 
+ engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True if not PYTHON_BINDINGS and not args.use_py_session: logger.warning( @@ -367,34 +351,60 @@ def main(args): "Debug mode is not supported in C++ session for now, fallback to Python session." ) args.use_py_session = True + if args.return_all_generated_tokens and args.use_py_session: + raise ValueError( + "Returning all the generated tokens at each step is not supported in the Python session, use C++ session instead." + ) + if (not args.return_all_generated_tokens) and args.streaming and ( + args.num_beams > 1): + logger.warning( + "Setting return_all_generated_tokens to True since streaming AND beam search are done simultaneously. " + "Returning the full beams at each streaming step is needed because beam search + streaming can change previous outputs. " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + args.return_all_generated_tokens = True runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - lora_dir=args.lora_dir, - rank=runtime_rank, - debug_mode=args.debug_mode, - lora_ckpt_source=args.lora_ckpt_source) + runner_kwargs = dict( + engine_dir=args.engine_dir, + lora_dir=args.lora_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + lora_ckpt_source=args.lora_ckpt_source, + gpu_weights_percent=args.gpu_weights_percent, + ) + if not args.use_py_session: + runner_kwargs.update(is_enc_dec=is_enc_dec) if args.medusa_choices is not None: args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" assert args.num_beams == 1, "Medusa should use num_beams == 1" runner_kwargs.update(medusa_choices=args.medusa_choices) if not args.use_py_session: runner_kwargs.update( max_batch_size=len(batch_input_ids), - max_input_len=max(input_lengths), + max_input_len=max( + encoder_input_lengths if is_enc_dec else input_lengths), max_output_len=args.max_output_len, max_beam_width=args.num_beams, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, - ) + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) runner = runner_cls.from_dir(**runner_kwargs) torch.cuda.synchronize() start_time = time.time() with torch.no_grad(): outputs = runner.generate( - batch_input_ids, + batch_input_ids=decoder_input_ids + if is_enc_dec else batch_input_ids, + encoder_input_ids=encoder_input_ids if is_enc_dec else None, max_new_tokens=args.max_output_len, max_attention_window_size=args.max_attention_window_size, sink_token_length=args.sink_token_length, @@ -405,27 +415,32 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, + no_repeat_ngram_size=args.no_repeat_ngram_size, return_dict=True, - medusa_choices=args.medusa_choices) + medusa_choices=args.medusa_choices, + return_all_generated_tokens=args.return_all_generated_tokens) torch.cuda.synchronize() - - status = False + end_time = time.time() + if runtime_rank == 0: num_inputs = sum([torch.numel(x) for x in batch_input_ids]) num_outputs = torch.numel(outputs["output_ids"]) num_gens = num_outputs - num_inputs - load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") qps = num_gens/(end_time-start_time) logger.info(f'Load engine takes: {load_engine_time} sec') @@ -433,29 +448,46 @@ def main(args): status = check_status(args, load_engine_time, qps) else: status = True - + if args.streaming: for curr_outputs in throttle_generator(outputs, args.streaming_interval): if runtime_rank == 0: output_ids = curr_outputs['output_ids'] sequence_lengths = curr_outputs['sequence_lengths'] - print_output(tokenizer, - output_ids, - input_lengths, - sequence_lengths, - output_csv=args.output_csv, - output_npy=args.output_npy) + cum_log_probs = None + log_probs = None + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] + print_output( + tokenizer, + output_ids, + input_lengths, + sequence_lengths, + output_csv=args.output_csv, + output_npy=args.output_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) else: if runtime_rank == 0: output_ids = outputs['output_ids'] sequence_lengths = outputs['sequence_lengths'] context_logits = None generation_logits = None + cum_log_probs = None + log_probs = None if runner.gather_context_logits: context_logits = outputs['context_logits'] if runner.gather_generation_logits: generation_logits = outputs['generation_logits'] + if args.output_cum_log_probs_npy != None: + cum_log_probs = outputs['cum_log_probs'] + if args.output_log_probs_npy != None: + log_probs = outputs['log_probs'] print_output(tokenizer, output_ids, input_lengths, @@ -464,7 +496,11 @@ def 
main(args): output_npy=args.output_npy, context_logits=context_logits, generation_logits=generation_logits, - output_logits_npy=args.output_logits_npy) + output_logits_npy=args.output_logits_npy, + cum_log_probs=cum_log_probs, + log_probs=log_probs, + output_cum_log_probs_npy=args.output_cum_log_probs_npy, + output_log_probs_npy=args.output_log_probs_npy) if args.run_profiling: ite = 10 @@ -482,17 +518,24 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.start("tmp") @@ -509,23 +552,31 @@ def main(args): top_p=args.top_p, num_beams=args.num_beams, length_penalty=args.length_penalty, + early_stopping=args.early_stopping, repetition_penalty=args.repetition_penalty, presence_penalty=args.presence_penalty, frequency_penalty=args.frequency_penalty, stop_words_list=stop_words_list, bad_words_list=bad_words_list, + output_cum_log_probs=(args.output_cum_log_probs_npy != + None), + output_log_probs=(args.output_log_probs_npy != None), + random_seed=args.random_seed, lora_uids=args.lora_task_uids, - prompt_table_path=args.prompt_table_path, + prompt_table=args.prompt_table_path, prompt_tasks=args.prompt_tasks, streaming=args.streaming, output_sequence_lengths=True, - return_dict=True) + return_dict=True, + return_all_generated_tokens=args.return_all_generated_tokens + ) torch.cuda.synchronize() tensorrt_llm.profiler.stop("tmp") print( f"batch_size: {len(batch_input_ids)}, avg latency of {ite} iterations: : {tensorrt_llm.profiler.elapsed_time_in_sec('tmp') / ite} sec" ) + if status: print("successful.") else: @@ -536,4 +587,4 @@ def main(args): if __name__ == '__main__': args = parse_arguments() print(args) - main(args) + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt index f7cbbb8b7e9bbd8aab6303fd8b5de1dacbd353b8..38e019fe009252d42e512c6b71cc261bb7788de7 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/requirements.txt @@ -1,30 +1,43 @@ -accelerate +accelerate>=0.25.0 build colored # cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. -diffusers +# diffusers>=0.27.0 lark mpi4py -numpy +numpy<2 onnx>=1.12.0 +openai polygraphy psutil pybind11 -pynvml>=11.5.0 -sentencepiece>=0.1.99 -# tensorrt==9.2.0.post12.dev5 -# torch -# nvidia-ammo~=0.5.0; platform_machine=="x86_64" -transformers +# pynvml>=11.5.0 +pulp +pandas +h5py==3.10.0 +StrEnum +# tensorrt~=10.3.0 +# https://github.com/pytorch/pytorch/blob/v2.4.0/version.txt uses 2.4.0a0. 
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07 uses 2.4.0a0. +# torch>=2.4.0a0,<=2.4.0 +# nvidia-modelopt~=0.15.0 +transformers>=4.38.2,<=4.42.4 +#transformers +pillow==10.3.0 wheel optimum -evaluate janus -parameterized -scikit-learn +mpmath>=1.3.0 +click +click_option_group +aenum +datasets==2.14.6 +evaluate~=0.4.1 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 + -# special -scipy==1.11.4 -pandas==1.5.3 -nltk -rouge_score +setuptools +parameterized +# scikit-learn +# scipy==1.11.4 \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh index 79d1b888ee49a54a7db6cafbddc20c7fbd07498f..e5b8223c8e45ffe3a6713e5f49067d533b5ffedf 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1.sh @@ -1,15 +1,32 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + #!/bin/bash EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +LOAD_TIME_TARGET=${LOAD_TIME_TARGET:-15} +TPS_TARGET=${TPS_TARGET:-27.8} PROJECT_DIR="./" DATASET_DIR=${DATASET_DIR:-"${PROJECT_DIR}/data/datasets_cnn_dailymail"} MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-7b-chat"} -ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}/checkpoints"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -24,7 +41,7 @@ check_status() export TASK_DATA_PATH=${DATASET_DIR} -# target is 95% of best (load engine time: 11.78, rouge1: 28.53, tps: 37.78) +# target is 80% of best (load engine time: 10, rouge1: 28.41, tps: 34.85) python3 ${PROJECT_DIR}/summarize.py \ --test_trt_llm \ --log_level ${LOG_LEVEL} \ @@ -34,8 +51,8 @@ python3 ${PROJECT_DIR}/summarize.py \ --tokenizer_dir ${MODEL_DIR} \ --tokenizer_type "llama" \ --engine_dir ${ENGINE_DIR} \ ---target_load_engine_time 12.4 \ --tensorrt_llm_rouge1_threshold 27.1 \ ---target_tps 35.89 \ +--target_load_engine_time ${LOAD_TIME_TARGET} \ +--target_tps ${TPS_TARGET} \ --use_py_session "$@"; check_status -exit ${EXIT_STATUS} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh index eb3cb06f0db50575598a105a7e7ebe29d08f4b3e..e849ba14a3e071b91b6b6488b028bcb865dc2bc7 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh +++ b/models/nlp/large_language_model/llama2-7b/trtllm/scripts/test_trtllm_llama2_7b_gpu1_build.sh @@ -1,14 +1,31 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + #!/bin/bash EXIT_STATUS=0 -LOG_LEVEL=info +LOG_LEVEL=${LOG_LEVEL:-INFO} BS=${BS:-1} DTYPE=${DTYPE:-"float16"} +BUILD_TIME_TARGET=${BUILD_TIME_TARGET:-38.57} PROJECT_DIR="./" MODEL_DIR=${MODEL_DIR:-"${PROJECT_DIR}/data/llama2-7b-chat"} -OUTPUT_DIR=${OUTPUT_DIR:-"${PROJECT_DIR}/checkpoints/"} +ENGINE_DIR=${ENGINE_DIR:-"${PROJECT_DIR}"} +CHECKPOINT_DIR="${ENGINE_DIR}/checkpoints" export TLLM_LOG_LEVEL=${LOG_LEVEL} export PLUGIN_DTYPE="float16" @@ -21,13 +38,18 @@ check_status() } -python3 ${PROJECT_DIR}/build.py \ ---log_level ${LOG_LEVEL} \ ---dtype ${DTYPE} \ +python3 convert_checkpoint.py \ --model_dir ${MODEL_DIR} \ ---remove_input_padding \ ---use_gpt_attention_plugin float16 --use_gemm_plugin float16 \ ---enable_context_fmha \ ---disable_xqa \ ---output_dir ${OUTPUT_DIR} "$@"; check_status -exit ${EXIT_STATUS} +--output_dir ${CHECKPOINT_DIR} \ +--dtype ${DTYPE} + + +# best(build engine time: 27) is 70% of target +trtllm-build \ +--log_level ${LOG_LEVEL} \ +--max_batch_size ${BS} \ +--checkpoint_dir ${CHECKPOINT_DIR} \ +--remove_input_padding enable \ +--total_build_time_target ${BUILD_TIME_TARGET} \ +--output_dir ${ENGINE_DIR} "$@"; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py b/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py index acf06abd7708c098c30a40bc905a52d84d83deb6..8896ab0bfb500807ea8e6d8a2b9568a3afc257f6 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py +++ b/models/nlp/large_language_model/llama2-7b/trtllm/summarize.py @@ -26,11 +26,12 @@ import torch from datasets import load_dataset, load_from_disk from transformers import (AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, GenerationConfig) -from utils import DEFAULT_HF_MODEL_DIRS, load_tokenizer, read_model_name +from utils import (DEFAULT_HF_MODEL_DIRS, add_common_args, load_tokenizer, + read_model_name, supports_inflight_batching) import tensorrt_llm import tensorrt_llm.profiler as profiler -from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm._utils import mpi_broadcast, str_dtype_to_torch from tensorrt_llm.logger import logger from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner from tensorrt_llm.tools.ppl import ppl @@ -58,19 +59,26 @@ def main(args): runtime_rank = tensorrt_llm.mpi_rank() logger.set_level(args.log_level) - model_name = read_model_name(args.engine_dir) + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + model_name, model_version = read_model_name(args.engine_dir) if args.hf_model_dir is None: - args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + logger.warning( + "hf_model_dir is not specified. Try to infer from model_name, but this may be incorrect." 
+ ) + if model_name in DEFAULT_HF_MODEL_DIRS: + args.hf_model_dir = DEFAULT_HF_MODEL_DIRS[model_name] + else: + args.hf_model_dir = None if args.tokenizer_dir is None: args.tokenizer_dir = args.hf_model_dir - test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 - test_trt_llm = args.test_trt_llm profiler.start('load tokenizer') tokenizer, pad_id, end_id = load_tokenizer( tokenizer_dir=args.tokenizer_dir, vocab_file=args.vocab_file, model_name=model_name, + model_version=model_version, tokenizer_type=args.tokenizer_type, ) profiler.stop('load tokenizer') @@ -96,24 +104,34 @@ def main(args): dataset_input_key = 'input' dataset_output_key = 'output' dataset_split = 'validation' # only this split contains reference strings - - + elif args.eval_task == "eval_context_ppl": + dataset_name = "SlimPajama-6B" + dataset_revision = None + dataset_input_key = 'text' + dataset_output_key = 'text' + dataset_split = 'test' + args.output_len = 1 # Only want to compute the ppl of context + args.eval_ppl = True + logger.warning( + f"Run task '{args.eval_task}', setting 'output_len' to 1, and enable 'eval_ppl'." + ) + if args.dataset_dir is not None and isinstance(args.dataset_dir, str): + args.dataset_dir = args.dataset_dir.rstrip('/') + if args.dataset_dir.endswith(dataset_name): + dataset_name = args.dataset_dir + else: + dataset_name = f"{args.dataset_dir}/{dataset_name}" + logger.info(f"prepare datasets....") if os.getenv("TASK_DATA_PATH"): dataset = load_from_disk(os.getenv("TASK_DATA_PATH"))[dataset_split] else: - # dataset = load_dataset(dataset_name, - # dataset_revision, - # cache_dir=args.dataset_path, - # split=dataset_split, - # trust_remote_code=True) - dataset = load_dataset(dataset_name, dataset_revision, - cache_dir=args.dataset_path, + cache_dir=args.dataset_cache_dir, split=dataset_split) - logger.info(f"datasets is ready.") + max_batch_size = args.batch_size # runtime parameters @@ -124,77 +142,35 @@ def main(args): max_attention_window_size = args.max_attention_window_size sink_token_length = args.sink_token_length + if args.end_id: + end_id = args.end_id + + stop_words_list = None + if args.stop_words: + stop_words_list = tensorrt_llm.runtime.decode_words_list( + args.stop_words, tokenizer) + if model_version == 'glm4': # add default stop token ids for GLM-4 + glm4_stop_ids = [[151329], [151336], [151338]] + if stop_words_list is None: + stop_words_list = [glm4_stop_ids] * args.batch_size + else: + for req_stop_words_list in stop_words_list: + req_stop_words_list.extend(glm4_stop_ids) + + bad_words_list = None + if args.bad_words: + bad_words_list = tensorrt_llm.runtime.decode_words_list( + args.bad_words, tokenizer) + # random_seed = 5 temperature = args.temperature num_beams = args.num_beams length_penalty = args.length_penalty + early_stopping = args.early_stopping repetition_penalty = args.repetition_penalty presence_penalty = args.presence_penalty frequency_penalty = args.frequency_penalty - if test_trt_llm: - if not PYTHON_BINDINGS and not args.use_py_session: - logger.warning( - "Python bindings of C++ session is unavailable, fallback to Python session." 
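Dataset resolution in summarize.py now prefers a pre-processed on-disk copy (TASK_DATA_PATH via load_from_disk) and otherwise maps --dataset_dir onto the hub dataset name, reusing the path directly when it already ends with that name. The helper below restates only that path logic; the function name is illustrative, and SlimPajama-6B is simply the dataset used by the new eval_context_ppl task.

    def resolve_dataset_source(dataset_dir, dataset_name):
        # Mirrors the --dataset_dir handling: reuse the local path if it already
        # points at the dataset, otherwise treat it as a parent directory.
        if dataset_dir is None:
            return dataset_name                      # fall back to the hub name
        dataset_dir = dataset_dir.rstrip('/')
        if dataset_dir.endswith(dataset_name):
            return dataset_dir
        return f"{dataset_dir}/{dataset_name}"

    print(resolve_dataset_source("/data/SlimPajama-6B", "SlimPajama-6B"))  # /data/SlimPajama-6B
    print(resolve_dataset_source("/data", "SlimPajama-6B"))                # /data/SlimPajama-6B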
- ) - args.use_py_session = True - runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp - runner_kwargs = dict(engine_dir=args.engine_dir, - rank=runtime_rank, - debug_mode=args.debug_mode) - if args.medusa_choices is not None: - args.medusa_choices = ast.literal_eval(args.medusa_choices) - assert args.use_py_session, "Medusa is only supported by py_session" - assert args.temperature == 0, "Medusa should use temperature == 0" - assert args.num_beams == 1, "Medusa should use num_beams == 1" - runner_kwargs.update(medusa_choices=args.medusa_choices) - if not args.use_py_session: - runner_kwargs.update( - max_batch_size=max_batch_size, - max_input_len=test_token_num, - max_output_len=output_len, - max_beam_width=num_beams, - max_attention_window_size=max_attention_window_size, - sink_token_length=sink_token_length) - runner = runner_cls.from_dir(**runner_kwargs) - assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ - "PPL evaluation requires engine built with gather_all_token_logits enabled" - - if test_hf: - profiler.start('load HF model') - dtype_alias_mapping = { - 'fp32': 'float32', - 'fp16': 'float16', - 'bf16': 'bfloat16' - } - args.data_type = dtype_alias_mapping.get(args.data_type, args.data_type) - if model_name.startswith('chatglm'): - auto_model_cls = AutoModel - elif model_name.startswith('glm'): - auto_model_cls = AutoModelForSeq2SeqLM - else: - auto_model_cls = AutoModelForCausalLM - model = auto_model_cls.from_pretrained( - args.hf_model_dir, - trust_remote_code=True, - torch_dtype=str_dtype_to_torch(args.data_type), - device_map='auto' if args.hf_device_map_auto else None) - try: - model.to_bettertransformer() - except ValueError as e: - logger.warning( - f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' - ) - if not args.hf_device_map_auto: - model.cuda() - if model_name == 'qwen': - model.generation_config = GenerationConfig.from_pretrained( - args.hf_model_dir, trust_remote_code=True) - profiler.stop('load HF model') - logger.info( - f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' - ) - output_dir = Path(args.output_dir) if args.output_dir else None if output_dir is not None: output_dir.mkdir(exist_ok=True, parents=True) @@ -207,9 +183,21 @@ def main(args): f.write(f'Model path: {args.hf_model_dir}\n') f.write(f'Tokenizer path: {args.tokenizer_dir}\n') + # TODO: Add random_seed flag in gptj + rouge_dir = args.rouge_dir if args.rouge_dir and os.path.exists( + args.rouge_dir) else "rouge" + metric_tensorrt_llm = [evaluate.load(rouge_dir) for _ in range(num_beams)] + metric_hf = [evaluate.load(rouge_dir) for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + ppls_trt_llm = [[] for _ in range(num_beams)] + ppls_hf = [[] for _ in range(num_beams)] + def _prepare_inputs(batch_input_texts, eval_task='summarize', - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(batch_input_texts) append_str = ' TL;DR: ' if eval_task == 'summarize' else '' batch_input_ids = [] @@ -218,12 +206,13 @@ def main(args): curr_text = curr_text.strip().replace(" n't", "n't") # TODO: The below lines are used to be compatible with the original code; may need fix - if model_name.startswith(('chatglm2', 'chatglm3')): + if 'GLM' in model_name and model_version in ('chatglm2', + 'chatglm3'): input_ids = tokenizer.encode(curr_text, return_tensors='pt').squeeze(0) input_ids = 
input_ids[:test_token_num] - elif model_name == 'qwen': - from qwen.utils.utils import make_context + elif 'qwen' in model_name.lower() and model_version == 'qwen': + from tensorrt_llm.models.qwen.utils import make_context # use make_content to generate prompt system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user." _, input_id_list = make_context( @@ -235,6 +224,18 @@ def main(args): ) input_ids = torch.tensor(input_id_list) else: + if 'qwen' in model_name.lower() and 'qwen2' in model_version: + messages = [{ + "role": + "system", + "content": + "You are a helpful assistant, please summarize the article entered by the user with one or two sentences." + }, { + "role": "user", + "content": curr_text + }] + curr_text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) input_ids = tokenizer.encode( curr_text, return_tensors='pt', @@ -242,17 +243,23 @@ def main(args): truncation=True, max_length=test_token_num).squeeze(0) - batch_input_ids.append(input_ids) + if input_ids.numel() > min_input_length: + batch_input_ids.append(input_ids) return batch_input_ids def eval_trt_llm(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], {} input_lengths = [x.size(0) for x in batch_input_ids] with torch.no_grad(): @@ -266,11 +273,15 @@ def main(args): temperature=temperature, top_k=top_k, top_p=top_p, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, num_beams=num_beams, length_penalty=length_penalty, + early_stopping=early_stopping, repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, + lora_uids=args.lora_task_uids, output_sequence_lengths=True, return_dict=True, medusa_choices=args.medusa_choices) @@ -327,7 +338,8 @@ def main(args): def eval_hf(datapoint, eval_task='summarize', eval_ppl=False, - add_special_tokens=True): + add_special_tokens=True, + min_input_length=0): batch_size = len(datapoint[dataset_input_key]) if batch_size > 1: logger.warning( @@ -335,7 +347,11 @@ def main(args): ) batch_input_ids = _prepare_inputs(datapoint[dataset_input_key], eval_task=eval_task, - add_special_tokens=add_special_tokens) + add_special_tokens=add_special_tokens, + min_input_length=min_input_length) + batch_size = len(batch_input_ids) + if batch_size == 0: + return [], [], [], [[] for _ in range(batch_size)] input_lengths = [x.size(0) for x in batch_input_ids] # Left padding for HF max_length = max(input_lengths) @@ -349,6 +365,12 @@ def main(args): batch_input_ids = torch.stack(batch_input_ids) batch_input_ids = batch_input_ids.cuda() + # specialization for HF + if early_stopping in [0, 1]: + local_early_stopping = bool(early_stopping) + else: + local_early_stopping = "never" + with torch.no_grad(): outputs = model.generate(batch_input_ids, max_new_tokens=output_len, @@ -358,8 +380,8 @@ def main(args): pad_token_id=pad_id, num_beams=num_beams, num_return_sequences=num_beams, - early_stopping=True, length_penalty=length_penalty, + early_stopping=local_early_stopping, output_scores=True, return_dict_in_generate=True) if 
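Because _prepare_inputs now drops samples that are not strictly longer than min_input_length (the input_ids.numel() > min_input_length check above), a batch can come back empty, which is why both eval paths return early and the main loop skips such iterations. A toy version of the filter, using torch only to mimic the tensor inputs; the helper name is illustrative.

    import torch

    def filter_short_inputs(batch_input_ids, min_input_length=0):
        # Keep only samples strictly longer than min_input_length, as in _prepare_inputs.
        return [ids for ids in batch_input_ids if ids.numel() > min_input_length]

    batch = [torch.arange(5), torch.arange(2), torch.arange(40)]
    kept = filter_short_inputs(batch, min_input_length=4)
    print(len(kept))  # 2 -> the 2-token sample was dropped; an all-empty batch is skipped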
eval_ppl and batch_size == 1: @@ -384,7 +406,14 @@ def main(args): output_ids != pad_id).sum(dim=-1) context_logits = context_outputs['logits'] # Remove the first generation logits which are same to last context logits - generation_logits = torch.stack(outputs['scores'][1:], dim=1) + generation_logits = outputs['scores'][1:] + # When output_len is 1, generation_logits would be () and lead to error if we do torch.stack + if len(generation_logits) == 0: + generation_logits = torch.empty( + [context_logits.shape[0], 0, context_logits.shape[-1]], + device=context_logits.device) + else: + generation_logits = torch.stack(generation_logits, dim=1) _, max_gen_len, voc_size = generation_logits.size() generation_logits = generation_logits.view(batch_size, num_beams, max_gen_len, voc_size) @@ -410,12 +439,58 @@ def main(args): return output_lines_list, tokens_list, ppls if test_trt_llm: + if not supports_inflight_batching(args.engine_dir): + logger.warning( + "The given engine does not support in-flight batching, fallback to python session" + ) + args.use_py_session = True + + if not PYTHON_BINDINGS and not args.use_py_session: + logger.warning( + "Python bindings of C++ session is unavailable, fallback to Python session." + ) + args.use_py_session = True + if args.return_all_generated_tokens: + raise ValueError( + "Returning all the generated tokens at each step is not supported in summarize.py" + ) + runner_cls = ModelRunner if args.use_py_session else ModelRunnerCpp + runner_kwargs = dict(engine_dir=args.engine_dir, + rank=runtime_rank, + debug_mode=args.debug_mode, + gpu_weights_percent=args.gpu_weights_percent) + if args.medusa_choices is not None: + args.medusa_choices = ast.literal_eval(args.medusa_choices) + assert args.temperature == 1.0, "Medusa should use temperature == 1.0" + assert args.num_beams == 1, "Medusa should use num_beams == 1" + runner_kwargs.update(medusa_choices=args.medusa_choices) + if not args.use_py_session: + runner_kwargs.update( + max_batch_size=max_batch_size, + max_input_len=test_token_num, + max_output_len=output_len, + max_beam_width=num_beams, + max_attention_window_size=max_attention_window_size, + sink_token_length=sink_token_length, + max_tokens_in_paged_kv_cache=args.max_tokens_in_paged_kv_cache, + kv_cache_enable_block_reuse=args.kv_cache_enable_block_reuse, + kv_cache_free_gpu_memory_fraction=args. 
+ kv_cache_free_gpu_memory_fraction, + enable_chunked_context=args.enable_chunked_context, + multi_block_mode=args.multi_block_mode) + runner_kwargs.update( + enable_context_fmha_fp32_acc=args.enable_context_fmha_fp32_acc) + runner = runner_cls.from_dir(**runner_kwargs) + assert not (args.eval_ppl and not (runner.gather_context_logits and runner.gather_generation_logits)), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + datapoint = dataset[0:1] output, *_ = eval_trt_llm(datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - if runtime_rank == 0: + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": logger.info( "---------------------------------------------------------") logger.info("TensorRT-LLM Generated : ") @@ -424,71 +499,50 @@ def main(args): logger.info(f"\n Output : {output}") logger.info( "---------------------------------------------------------") - if test_hf: - datapoint = dataset[0:1] - output, *_ = eval_hf(datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - logger.info("---------------------------------------------------------") - logger.info("HF Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info("---------------------------------------------------------") - # TODO: Add random_seed flag in gptj - metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] - metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] - for i in range(num_beams): - metric_tensorrt_llm[i].seed = 0 - metric_hf[i].seed = 0 - ppls_trt_llm = [[] for _ in range(num_beams)] - ppls_hf = [[] for _ in range(num_beams)] + ite_count = 0 + data_point_idx = 0 + total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 + + if args.stability_test: + logger.info(f"stability test, need {args.stability_test_hours} hours") + else: + logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") + stability_start_time = time.time() - ite_count = 0 - data_point_idx = 0 - total_output_token_count_trt_llm = 0 # only valid for runtime_rank == 0 - - if args.stability_test: - logger.info(f"stability test, need {args.stability_test_hours} hours") - else: - logger.info(f"dataset size: {len(dataset)}, max_ite: {args.max_ite}") - stability_start_time = time.time() - while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): - if runtime_rank == 0: - logger.debug( - f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" - ) - datapoint = dataset[data_point_idx:(data_point_idx + max_batch_size)] + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] - if test_trt_llm: profiler.start('tensorrt_llm') output_tensorrt_llm, output_ids_trt_llm, curr_ppls_trt_llm, lengths_info = eval_trt_llm( datapoint, eval_task=args.eval_task, eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) profiler.stop('tensorrt_llm') + + empty_batch = (runtime_rank == 0 and len(output_tensorrt_llm) == 0) + 
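The runner setup earlier in this hunk no longer relies solely on --use_py_session: it falls back to the Python session when the engine was built without in-flight batching support or when the C++ bindings are unavailable. The decision logic, reduced to a pure function over the same conditions (the function and return strings are illustrative):

    def pick_runner(use_py_session: bool,
                    engine_supports_ifb: bool,
                    python_bindings_available: bool) -> str:
        """Return which session type summarize.py would end up using."""
        if not engine_supports_ifb:
            use_py_session = True      # engine built without in-flight batching
        if not python_bindings_available:
            use_py_session = True      # C++ bindings not importable
        return "ModelRunner (python)" if use_py_session else "ModelRunnerCpp (c++)"

    print(pick_runner(False, True, True))    # ModelRunnerCpp (c++)
    print(pick_runner(False, False, True))   # falls back to the Python session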
empty_batch = mpi_broadcast(empty_batch, 0) + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + if runtime_rank == 0: input_lengths = lengths_info['input_lengths'] seq_lengths = lengths_info['seq_lengths'] output_token_count_trt_llm = sum( - seq_lengths[idx][0] - input_lengths[idx] - for idx in range(len(input_lengths))) + seq_lengths[bs][bm] - input_lengths[bs] + for bm in range(len(output_tensorrt_llm[0])) + for bs in range(len(output_tensorrt_llm))) total_output_token_count_trt_llm += output_token_count_trt_llm - if test_hf: - profiler.start('hf') - output_hf, _, curr_ppls_hf = eval_hf( - datapoint, - eval_task=args.eval_task, - eval_ppl=args.eval_ppl, - add_special_tokens=args.add_special_tokens) - profiler.stop('hf') - - if runtime_rank == 0: - if test_trt_llm: for batch_idx in range(len(output_tensorrt_llm)): for beam_idx in range(num_beams): metric_tensorrt_llm[beam_idx].add_batch( @@ -502,13 +556,121 @@ def main(args): ppls_trt_llm[beam_idx].append( curr_ppls_trt_llm[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_tensorrt_llm[0])): for beam_idx in range(num_beams): with (output_dir / 'trtllm.out').open('a') as f: - f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n') - # yapf: enable - if test_hf: + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_tensorrt_llm[beam_idx][i]}\n' + ) + + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") + logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') + logger.debug(f"Reference : {datapoint[dataset_output_key]}") + + data_point_idx += max_batch_size + ite_count += 1 + + if args.stability_test: + test_time_hours = round((time.time() - stability_start_time) / 3600, 1) + if test_time_hours > args.stability_test_hours: + if runtime_rank == 0: + logger.info(f"Stability Test Finished. 
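The throughput accounting above now sums generated tokens over every beam of every sample (sequence length minus prompt length) rather than beam 0 only. A small numeric restatement of that intent; the helper name and the example lengths are made up for illustration.

    def count_output_tokens(seq_lengths, input_lengths, num_beams):
        # seq_lengths[bs][bm]: prompt + generated length per sample and beam;
        # input_lengths[bs]:   prompt length per sample.
        return sum(seq_lengths[bs][bm] - input_lengths[bs]
                   for bm in range(num_beams)
                   for bs in range(len(input_lengths)))

    seq_lengths = [[110, 105], [90, 95]]   # 2 samples, 2 beams each
    input_lengths = [100, 80]
    print(count_output_tokens(seq_lengths, input_lengths, num_beams=2))  # 10+5+10+15 = 40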
Total run {test_time_hours} hours.") + break + else: + data_point_idx = data_point_idx % len(dataset) + ite_count = ite_count % args.max_ite + if runtime_rank == 0 and ite_count % 100 == 0: + logger.info( + f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") + elif runtime_rank == 0 and ite_count % 10 == 0: + logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + + del runner + + if test_hf and runtime_rank == 0: + profiler.start('load HF model') + dtype_alias_mapping = { + 'fp32': 'float32', + 'fp16': 'float16', + 'bf16': 'bfloat16' + } + args.hf_data_type = dtype_alias_mapping.get(args.hf_data_type, + args.hf_data_type) + if 'GLM' in model_name and model_version == 'glm': + auto_model_cls = AutoModelForSeq2SeqLM + elif 'GLM' in model_name and model_version == 'chatglm': + auto_model_cls = AutoModel + else: + auto_model_cls = AutoModelForCausalLM + model = auto_model_cls.from_pretrained( + args.hf_model_dir, + trust_remote_code=True, + torch_dtype=str_dtype_to_torch(args.hf_data_type), + device_map='auto' if args.hf_device_map_auto else None) + try: + model.to_bettertransformer() + except Exception as e: + logger.warning( + f'Fail to call model.to_bettertransformer(), exception:\n{str(e)}' + ) + if not args.hf_device_map_auto: + model.cuda() + if model_name == 'qwen': + model.generation_config = GenerationConfig.from_pretrained( + args.hf_model_dir, trust_remote_code=True) + profiler.stop('load HF model') + logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + + datapoint = dataset[0:1] + output, *_ = eval_hf(datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + if runtime_rank == 0 and args.eval_task != "eval_context_ppl": + logger.info( + "---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Input : {datapoint[dataset_input_key]}") + logger.info(f"\n Reference : {datapoint[dataset_output_key]}") + logger.info(f"\n Output : {output}") + logger.info( + "---------------------------------------------------------") + + ite_count = 0 + data_point_idx = 0 + total_output_token_count_hf = 0 # only valid for runtime_rank == 0 + while (data_point_idx < len(dataset)) and (ite_count < args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset[data_point_idx:(data_point_idx + + max_batch_size)] + + profiler.start('hf') + output_hf, token_list, curr_ppls_hf = eval_hf( + datapoint, + eval_task=args.eval_task, + eval_ppl=args.eval_ppl, + add_special_tokens=args.add_special_tokens, + min_input_length=args.min_input_length) + profiler.stop('hf') + + # HF model runs on rank 0 only + empty_batch = len(output_hf) == 0 + if empty_batch: + # No valid samples in the current batch, skip this iteration + data_point_idx += max_batch_size + continue + + if runtime_rank == 0: + seq_lengths = [len(tokens) for tokens in token_list] + total_output_token_count_hf += sum(seq_lengths) + for beam_idx in range(num_beams): for batch_idx in range(len(output_hf[beam_idx])): metric_hf[beam_idx].add_batch( @@ -520,37 +682,21 @@ def main(args): ppls_hf[beam_idx].append( curr_ppls_hf[batch_idx][beam_idx]) if output_dir is not None: - # yapf: disable for i in range(len(output_hf[0])): for beam_idx in range(num_beams): with (output_dir / 'hf.out').open('a') as f: - 
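Loading the HF reference model was moved after the TRT-LLM pass and runs on rank 0 only; it still normalizes dtype aliases and selects the AutoModel class from the (architecture, version) pair. The selection logic on its own, with the GLM strings taken from the hunk above; the function name is illustrative.

    from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM

    DTYPE_ALIASES = {'fp32': 'float32', 'fp16': 'float16', 'bf16': 'bfloat16'}

    def pick_auto_model_cls(model_name, model_version):
        if 'GLM' in model_name and model_version == 'glm':
            return AutoModelForSeq2SeqLM      # original GLM is seq2seq
        if 'GLM' in model_name and model_version == 'chatglm':
            return AutoModel                  # first-generation ChatGLM
        return AutoModelForCausalLM           # everything else, incl. Llama 2

    print(DTYPE_ALIASES.get('fp16', 'fp16'))                       # float16
    print(pick_auto_model_cls('LlamaForCausalLM', None).__name__)  # AutoModelForCausalLM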
f.write(f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n') - # yapf: enable + f.write( + f'[{data_point_idx + i}] [Beam {beam_idx}] {output_hf[beam_idx][i]}\n' + ) - logger.debug('-' * 100) - logger.debug(f"Input : {datapoint[dataset_input_key]}") - if test_trt_llm: - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') - if test_hf: + logger.debug('-' * 100) + logger.debug(f"Input : {datapoint[dataset_input_key]}") logger.debug(f'HF Output: {output_hf}') - logger.debug(f"Reference : {datapoint[dataset_output_key]}") + logger.debug(f"Reference : {datapoint[dataset_output_key]}") - data_point_idx += max_batch_size - ite_count += 1 - - if args.stability_test: - test_time_hours = round((time.time() - stability_start_time)/3600, 1) - if test_time_hours > args.stability_test_hours: - if runtime_rank == 0: - logger.info(f"Stability Test Finished. Total run {test_time_hours} hours.") - break - else: - data_point_idx = data_point_idx % len(dataset) - ite_count = ite_count % args.max_ite - if runtime_rank == 0 and ite_count % 100 == 0: - logger.info(f"stability test, remain {round(args.stability_test_hours - test_time_hours, 1)} hours") - elif runtime_rank == 0 and ite_count % 10 == 0: - logger.info(f"data_point_idx: {data_point_idx}, ite_count: {ite_count}") + data_point_idx += max_batch_size + ite_count += 1 + del model if runtime_rank == 0: if test_trt_llm: @@ -558,6 +704,7 @@ def main(args): logger.info( f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' ) + logger.info( f'TensorRT-LLM (total output tokens: {total_output_token_count_trt_llm})' ) @@ -567,30 +714,30 @@ def main(args): rouge1 = 0 tps = total_output_token_count_trt_llm / profiler.elapsed_time_in_sec("tensorrt_llm") - + for beam_idx in range(num_beams): logger.info(f"TensorRT-LLM beam {beam_idx} result") - computed_metrics_tensorrt_llm = metric_tensorrt_llm[ - beam_idx].compute() - for key in computed_metrics_tensorrt_llm.keys(): - logger.info( - f' {key} : {computed_metrics_tensorrt_llm[key]*100}') - - if args.check_accuracy and beam_idx == 0: - assert computed_metrics_tensorrt_llm[ - 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold - - if beam_idx == 0: - rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 - + if args.eval_task != "eval_context_ppl": + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key]*100}' + ) + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm[ + 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold + + if beam_idx == 0: + rouge1 = computed_metrics_tensorrt_llm['rouge1'] * 100 if args.eval_ppl: logger.info( f" Per-token perplexity: {np.mean(ppls_trt_llm[beam_idx])}" ) if args.check_accuracy and beam_idx == 0: - assert np.mean(ppls_trt_llm[beam_idx] - ) < args.tensorrt_llm_ppl_threshold - + avg_ppl = np.mean(ppls_trt_llm[beam_idx]) + assert avg_ppl < args.tensorrt_llm_ppl_threshold, f"[FAILED] average PPL ({avg_ppl}) is larger than threshold ({args.tensorrt_llm_ppl_threshold})" + load_engine_time = tensorrt_llm.profiler.elapsed_time_in_sec("load tensorrt_llm engine") logger.info(f'Load engine takes: {load_engine_time} sec') @@ -599,19 +746,27 @@ def main(args): print("successful.") else: print("failed.") - - sys.exit(int(not status)) - + + sys.exit(int(not status)) + if test_hf: np.random.seed(0) # rouge score use sampling to compute the score logger.info( f'Hugging Face (total 
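The TRT-LLM summary block above computes tokens per second from the total generated token count and the accumulated 'tensorrt_llm' timer, and gates beam 0's ROUGE-1 against --tensorrt_llm_rouge1_threshold when --check_accuracy is set; engine load time comes from the 'load tensorrt_llm engine' timer. The arithmetic alone is restated below with example numbers borrowed from the test script comment (rouge1 28.41 vs threshold 27.1); the final combination against the load-time and tps targets is handled elsewhere and is not reproduced.

    def trt_llm_summary(total_output_tokens, total_latency_s,
                        rouge1_pct, rouge1_threshold):
        tps = total_output_tokens / total_latency_s
        accuracy_ok = rouge1_pct > rouge1_threshold   # strict '>' as in the assert
        return tps, accuracy_ok

    tps, ok = trt_llm_summary(total_output_tokens=40000, total_latency_s=1148.1,
                              rouge1_pct=28.41, rouge1_threshold=27.1)
    print(round(tps, 2), ok)  # 34.84 True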
latency: {profiler.elapsed_time_in_sec("hf")} sec)' ) + logger.info( + f'Hugging Face (total output tokens: {total_output_token_count_hf})' + ) + logger.info( + f'Hugging Face (tokens per second: {total_output_token_count_hf / profiler.elapsed_time_in_sec("hf")})' + ) + for beam_idx in range(num_beams): logger.info(f"HF beam {beam_idx} result") computed_metrics_hf = metric_hf[beam_idx].compute() - for key in computed_metrics_hf.keys(): - logger.info(f' {key} : {computed_metrics_hf[key]*100}') + if args.eval_task != "eval_context_ppl": + for key in computed_metrics_hf.keys(): + logger.info(f' {key} : {computed_metrics_hf[key]*100}') if args.eval_ppl and args.batch_size == 1: logger.info( f" Per-token perplexity: {np.mean(ppls_hf[beam_idx])}") @@ -619,34 +774,15 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) - parser.add_argument( - '--tokenizer_dir', - default=None, - help='tokenizer path; defaults to hf_model_dir if left unspecified') - parser.add_argument( - '--tokenizer_type', - help= - 'Specify that argument when providing a .model file as the tokenizer_dir. ' - 'It allows AutoTokenizer to instantiate the correct tokenizer type.') - parser.add_argument('--vocab_file') parser.add_argument('--test_hf', action='store_true') parser.add_argument('--test_trt_llm', action='store_true') - parser.add_argument( - '--data_type', - type=str, - choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], - default='fp16') - parser.add_argument('--engine_dir', type=str, default='engine_outputs') - parser.add_argument('--use_py_session', - default=False, - action='store_true', - help="Whether or not to use Python runtime session") - parser.add_argument( - '--eval_task', - type=str, - default='summarize', - choices=['summarize', 'summarize_long', 'code_completion']) + parser.add_argument('--eval_task', + type=str, + default='summarize', + choices=[ + 'summarize', 'summarize_long', 'code_completion', + 'eval_context_ppl' + ]) parser.add_argument('--check_accuracy', action='store_true') parser.add_argument('--tensorrt_llm_rouge1_threshold', type=float, @@ -655,51 +791,33 @@ if __name__ == '__main__': parser.add_argument('--tensorrt_llm_ppl_threshold', type=float, default=15.0) + parser.add_argument( + '--dataset_dir', + type=str, + default=None, + help="The local directory of the dataset for evaluation; " + "will download the dataset from huggingface hub if not specified.") + parser.add_argument( + '--dataset_cache_dir', + type=str, + default=None, + help="The local cache directory for dataset; " + "will use `~/.cache/huggingface/datasets` if not specified.") parser.add_argument('--target_load_engine_time', type=float, default=0) parser.add_argument('--target_tps', type=float, default=0) - parser.add_argument('--dataset_path', type=str, default='') - parser.add_argument('--log_level', type=str, default='info') parser.add_argument('--batch_size', type=int, default=1) parser.add_argument('--max_ite', type=int, default=20) parser.add_argument('--output_len', type=int, default=100) parser.add_argument('--max_input_length', type=int, default=923) parser.add_argument( - '--max_attention_window_size', + '--min_input_length', type=int, - default=None, - help= - 'The attention window size that controls the sliding window attention / cyclic kv cache behaviour' - ) - parser.add_argument('--sink_token_length', - type=int, - default=None, - help='The sink token length.') - 
parser.add_argument('--num_beams', type=int, default=1) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--repetition_penalty', type=float, default=1.0) - parser.add_argument('--presence_penalty', type=float, default=0.0) - parser.add_argument('--frequency_penalty', type=float, default=0.0) - parser.add_argument('--debug_mode', - default=False, - action='store_true', - help="Whether or not to turn on the debug mode") - parser.add_argument('--no_add_special_tokens', - dest='add_special_tokens', - default=True, - action='store_false', - help="Whether or not to add special tokens") - parser.add_argument( - '--hf_device_map_auto', - action='store_true', - help="Use device map 'auto' to load a pretrained HF model. This may " - "help to test a large model that cannot fit into a singlue GPU.") + default=0, + help='skip the sentences which are shorter than min_input_length.') parser.add_argument( '--output_dir', type=str, @@ -708,17 +826,19 @@ if __name__ == '__main__': "TensorRT-LLM outputs, and 'hf.out' for HF outputs. If None, do not " "save outputs.") parser.add_argument( - '--medusa_choices', - type=str, + '--rouge_dir', default=None, - help="Medusa choice to use, if not none, will use Medusa decoding." - " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + type=str, + help= + "evaluate.load('rouge') will attempt to pull rouge package from HF. Use cached rouge can avoid network outage of host or HF." ) parser.add_argument('--stability_test', default=False, action='store_true', help="Whether or not to run stability test for tensorrt_llm.") parser.add_argument('--stability_test_hours', type=float, default=24.0) + parser = add_common_args(parser) args = parser.parse_args() print(args) - main(args) + + main(args) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/utils.py b/models/nlp/large_language_model/llama2-7b/trtllm/utils.py index 44042d9e2dcb44dd6cd917ab16a00010e4005202..340ea03995dc62d200234e43ec3e73a4d4923bbb 100644 --- a/models/nlp/large_language_model/llama2-7b/trtllm/utils.py +++ b/models/nlp/large_language_model/llama2-7b/trtllm/utils.py @@ -12,55 +12,90 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
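The new --rouge_dir flag lets a locally cached copy of the rouge metric stand in for the Hugging Face hub download; the fallback is a plain path-exists check before evaluate.load. Reduced to a helper (the name is illustrative; loading "rouge" from the hub requires network access and the rouge_score package from requirements.txt):

    import os
    import evaluate

    def load_rouge(rouge_dir=None):
        # Prefer a local metric directory when it exists, otherwise fetch "rouge"
        # from the Hugging Face hub.
        target = rouge_dir if rouge_dir and os.path.exists(rouge_dir) else "rouge"
        return evaluate.load(target)

    metric = load_rouge()  # falls back to the hub copy here
    metric.add_batch(predictions=["the cat sat"], references=["the cat sat down"])
    print(metric.compute()["rouge1"])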
- import json from pathlib import Path from typing import Optional -from transformers import AutoTokenizer, T5Tokenizer +from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer -import tensorrt_llm +from tensorrt_llm.bindings import GptJsonConfig +from tensorrt_llm.builder import get_engine_version DEFAULT_HF_MODEL_DIRS = { - 'baichuan': 'baichuan-inc/Baichuan-13B-Chat', - 'bloom': 'bigscience/bloom-560m', - 'chatglm_6b': 'THUDM/chatglm-6b', - 'chatglm2_6b': 'THUDM/chatglm2-6b', - 'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k', - 'chatglm3_6b': 'THUDM/chatglm3-6b', - 'chatglm3_6b_base': 'THUDM/chatglm3-6b-base', - 'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k', - 'falcon': 'tiiuae/falcon-rw-1b', - 'glm_10b': 'THUDM/glm-10b', - 'gpt': 'gpt2-medium', - 'gptj': 'EleutherAI/gpt-j-6b', - 'gptneox': 'EleutherAI/gpt-neox-20b', - 'internlm': 'internlm/internlm-chat-7b', - 'llama': 'meta-llama/Llama-2-7b-hf', - 'mpt': 'mosaicml/mpt-7b', - 'phi': 'microsoft/phi-2', - 'opt': 'facebook/opt-350m', - 'qwen': 'Qwen/Qwen-7B', + 'BaichuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BaiChuanForCausalLM': 'baichuan-inc/Baichuan-13B-Chat', + 'BloomForCausalLM': 'bigscience/bloom-560m', + 'GLMModel': 'THUDM/glm-10b', + 'ChatGLMModel': 'THUDM/chatglm3-6b', + 'ChatGLMForCausalLM': 'THUDM/chatglm3-6b', + 'RWForCausalLM': 'tiiuae/falcon-rw-1b', + 'FalconForCausalLM': 'tiiuae/falcon-rw-1b', + 'GPT2LMHeadModel': 'gpt2', + 'GPT2LMHeadCustomModel': 'gpt2', + 'Starcoder2ForCausalLM': 'bigcode/starcoder2-3b', + 'GPTForCausalLM': 'gpt2', + 'GPTJForCausalLM': 'EleutherAI/gpt-j-6b', + 'GPTNeoXForCausalLM': 'EleutherAI/gpt-neox-20b', + 'InternLMForCausalLM': 'internlm/internlm-chat-7b', + 'InternLM2ForCausalLM': 'internlm/internlm2-chat-7b', + 'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf', + 'MPTForCausalLM': 'mosaicml/mpt-7b', + 'PhiForCausalLM': 'microsoft/phi-2', + 'OPTForCausalLM': 'facebook/opt-350m', + 'QWenLMHeadModel': 'Qwen/Qwen-7B', + 'QWenForCausalLM': 'Qwen/Qwen-7B', + 'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B', + 'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B', + 'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b', } +INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (书生·浦语). +- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. +- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. 
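DEFAULT_HF_MODEL_DIRS is now keyed by the architecture string stored in the engine's config.json (e.g. 'LlamaForCausalLM') rather than by a short model nickname, which is why summarize.py can only infer hf_model_dir when that architecture appears in the table and warns otherwise. A lookup sketch (table trimmed to two entries; the helper name is illustrative):

    DEFAULT_HF_MODEL_DIRS = {
        'LlamaForCausalLM': 'meta-llama/Llama-2-7b-hf',
        'GPTJForCausalLM': 'EleutherAI/gpt-j-6b',
        # ... trimmed; see the full table above.
    }

    def infer_hf_model_dir(architecture):
        # Mirrors the warning path in summarize.py: None when no default is known.
        return DEFAULT_HF_MODEL_DIRS.get(architecture)

    print(infer_hf_model_dir('LlamaForCausalLM'))   # meta-llama/Llama-2-7b-hf
    print(infer_hf_model_dir('SomeCustomModel'))    # None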
+""" + +QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n" + DEFAULT_PROMPT_TEMPLATES = { - 'internlm': - "<|User|>:{input_text}\n<|Bot|>:", - 'qwen': - "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'InternLMForCausalLM': "<|User|>:{input_text}\n<|Bot|>:", + 'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION + + "<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n", + 'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE, + 'QWenForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE, + 'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE, } +def supports_inflight_batching(engine_dir): + config_path = Path(engine_dir) / "config.json" + json_config = GptJsonConfig.parse_file(config_path) + model_config = json_config.model_config + return model_config.supports_inflight_batching + + +def read_decoder_start_token_id(engine_dir): + with open(Path(engine_dir) / "config.json", 'r') as f: + config = json.load(f) + return config['pretrained_config']['decoder_start_token_id'] + + def read_model_name(engine_dir: str): - engine_version = tensorrt_llm.runtime.engine.get_engine_version(engine_dir) + engine_version = get_engine_version(engine_dir) with open(Path(engine_dir) / "config.json", 'r') as f: config = json.load(f) if engine_version is None: - return config['builder_config']['name'] + return config['builder_config']['name'], None - return config['pretrained_config']['architecture'] + model_arch = config['pretrained_config']['architecture'] + model_version = None + if 'GLM' in model_arch: + model_version = config['pretrained_config']['chatglm_version'] + if 'qwen' in model_arch.lower(): + model_version = config['pretrained_config']['qwen_type'] + return model_arch, model_version def throttle_generator(generator, stream_interval): @@ -74,7 +109,8 @@ def throttle_generator(generator, stream_interval): def load_tokenizer(tokenizer_dir: Optional[str] = None, vocab_file: Optional[str] = None, - model_name: str = 'gpt', + model_name: str = 'GPTForCausalLM', + model_version: Optional[str] = None, tokenizer_type: Optional[str] = None): if vocab_file is None: use_fast = True @@ -86,28 +122,34 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, padding_side='left', truncation_side='left', trust_remote_code=True, - tokenizer_type=tokenizer_type, + # tokenizer_type=tokenizer_type, # adapt to llama3 use_fast=use_fast) + elif model_name == 'GemmaForCausalLM' or model_name == 'RecurrentGemmaForCausalLM': + from transformers import GemmaTokenizer + + # Initialize tokenizer from vocab file. 
+ tokenizer = GemmaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False) + elif model_name == 'Grok1ModelForCausalLM': + tokenizer = LlamaTokenizer(vocab_file=vocab_file, + padding_side='left', + truncation_side='left', + legacy=False, + use_fast=False) else: # For gpt-next, directly load from tokenizer.model - assert model_name == 'gpt' tokenizer = T5Tokenizer(vocab_file=vocab_file, padding_side='left', - truncation_side='left') - - if model_name == 'qwen': + truncation_side='left', + legacy=False) + if 'qwen' in model_name.lower() and model_version == 'qwen': with open(Path(tokenizer_dir) / "generation_config.json") as f: gen_config = json.load(f) - chat_format = gen_config['chat_format'] - if chat_format == 'raw': - pad_id = gen_config['pad_token_id'] - end_id = gen_config['eos_token_id'] - elif chat_format == 'chatml': - pad_id = tokenizer.im_end_id - end_id = tokenizer.im_end_id - else: - raise Exception(f"unknown chat format: {chat_format}") - elif model_name == 'glm_10b': + pad_id = gen_config['pad_token_id'] + end_id = gen_config['eos_token_id'] + elif 'GLM' in model_name and model_version == 'glm': pad_id = tokenizer.pad_token_id end_id = tokenizer.eop_token_id else: @@ -117,3 +159,212 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None, end_id = tokenizer.eos_token_id return tokenizer, pad_id, end_id + + +def add_common_args(parser): + # sampling arguments + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams > 1", + default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--top_k', type=int, default=1) + parser.add_argument('--top_p', type=float, default=0.0) + parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--repetition_penalty', type=float, default=1.0) + parser.add_argument('--presence_penalty', type=float, default=0.0) + parser.add_argument('--frequency_penalty', type=float, default=0.0) + parser.add_argument('--beam_search_diversity_rate', type=float, default=0.0) + parser.add_argument('--random_seed', type=int, default=0) + parser.add_argument('--early_stopping', + type=int, + help='Use early stopping if num_beams > 1, ' + '1 for early-stopping, 0 for non-early-stopping' + 'other values for stopping by length', + default=1) + parser.add_argument( + '--end_id', + default=None, + type=int, + help="Override tokenizer end_id to stop on given end_id token.") + parser.add_argument( + '--stop_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set stop words for a batch. Successive invocations of --stop_words set stop words for other batches.' + ' E.g.: --stop_words " London" " chef" --stop_words "eventually became" "was not"', + ) + parser.add_argument( + '--bad_words', + default=None, + type=str, + nargs="+", + action='append', + help= + 'Set bad words for a batch. Successive invocations of --bad_words set bad words for other batches.' 
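--stop_words (and --bad_words below) combine nargs='+' with action='append', so each occurrence of the flag contributes one per-request word list. A quick demonstration of the resulting nested structure, matching the example in the help text:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--stop_words', default=None, type=str,
                        nargs='+', action='append')

    args = parser.parse_args(['--stop_words', ' London', ' chef',
                              '--stop_words', 'eventually became', 'was not'])
    print(args.stop_words)
    # [[' London', ' chef'], ['eventually became', 'was not']] -> one list per batch entry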
+ ' E.g.: --bad_words " London" " chef" --bad_words "eventually became" "was not"', + ) + parser.add_argument('--no_repeat_ngram_size', type=int, default=None) + + # common runtime arguments + parser.add_argument('--sink_token_length', + type=int, + default=None, + help='The sink token length.') + parser.add_argument( + '--max_attention_window_size', + type=int, + default=None, + help= + 'The attention window size that controls the sliding window attention / cyclic kv cache behavior' + ) + parser.add_argument( + '--multi_block_mode', + action='store_true', + help= + "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel." + ) + parser.add_argument('--enable_context_fmha_fp32_acc', + action='store_true', + help="Enable FMHA runner FP32 accumulation.") + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument( + '--no_prompt_template', + dest='use_prompt_template', + default=True, + action='store_false', + help= + "Whether or not to use default prompt template to wrap the input text.") + parser.add_argument('--use_py_session', + default=False, + action='store_true', + help="Whether or not to use Python runtime session") + parser.add_argument('--debug_mode', + default=False, + action='store_true', + help="Whether or not to turn on the debug mode") + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + parser.add_argument( + '--prompt_table_path', + type=str, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--prompt_tasks', + help="Comma-separated list of tasks for prompt tuning, e.g., 0,3,1,0") + parser.add_argument('--lora_dir', + type=str, + default=None, + nargs="+", + help="The directory of LoRA weights") + parser.add_argument('--lora_ckpt_source', + type=str, + default="hf", + choices=["hf", "nemo"], + help="The source of lora checkpoint.") + parser.add_argument( + '--lora_task_uids', + type=str, + default=None, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module") + parser.add_argument( + '--num_prepend_vtokens', + nargs="+", + type=int, + help="Number of (default) virtual tokens to prepend to each sentence." + " For example, '--num_prepend_vtokens=10' will prepend the tokens" + " [vocab_size, vocab_size + 1, ..., vocab_size + 9] to the sentence.") + parser.add_argument( + '--medusa_choices', + type=str, + default=None, + help="Medusa choice to use, if not none, will use Medusa decoding." + " E.g.: [[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]] for 9 medusa tokens." + ) + + # model arguments + parser.add_argument('--engine_dir', type=str, default='engine_outputs') + parser.add_argument( + '--tokenizer_type', + help= + 'Specify that argument when providing a .model file as the tokenizer_dir. 
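--medusa_choices is passed as a string and parsed with ast.literal_eval before being handed to the runner (summarize.py additionally requires temperature 1.0 and a single beam when it is set). Parsing in isolation, using the example value from the help text:

    import ast

    medusa_choices = "[[0, 0, 0, 0], [0, 1, 0], [1, 0], [1, 1]]"
    parsed = ast.literal_eval(medusa_choices)
    print(len(parsed), parsed[0])  # 4 [0, 0, 0, 0] -> four Medusa choice paths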
' + 'It allows AutoTokenizer to instantiate the correct tokenizer type.') + parser.add_argument('--vocab_file', + help="Used for sentencepiece tokenizers") + parser.add_argument('--no_add_special_tokens', + dest='add_special_tokens', + default=True, + action='store_false', + help="Whether or not to add special tokens") + parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None) + parser.add_argument( + '--tokenizer_dir', + default=None, + help='tokenizer path; defaults to hf_model_dir if left unspecified') + + # memory argument + parser.add_argument( + '--gpu_weights_percent', + default=1, + type=float, + help= + 'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.', + ) + parser.add_argument( + '--max_tokens_in_paged_kv_cache', + default=None, + type=int, + help= + 'Specify the maximum number of tokens in a kv cache page (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_enable_block_reuse', + action='store_true', + help= + 'Enables block reuse in kv cache (only available with cpp session).', + ) + parser.add_argument( + '--kv_cache_free_gpu_memory_fraction', + default=0.9, + type=float, + help='Specify the free gpu memory fraction.', + ) + parser.add_argument( + '--enable_chunked_context', + action='store_true', + help='Enables chunked context (only available with cpp session).', + ) + + # hf model argument (if use hf model) + parser.add_argument( + '--hf_data_type', + '--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16', 'float32', 'float16', 'bfloat16'], + default='fp16', + help="The data type for hf model.") + parser.add_argument( + '--hf_device_map_auto', + action='store_true', + help="Use device map 'auto' to load a pretrained HF model. This may " + "help to test a large model that cannot fit into a singlue GPU.") + + parser.add_argument( + "--return_all_generated_tokens", + default=False, + action="store_true", + help="This option changes the token output only for streaming. " + "If not specified, return only generated tokens at each step. " + "If specified, return the full beams/outputs at each step. " + "It is automatically enabled for num_beams>1 (only available with cpp session). " + "WARNING: using this option may increase network usage significantly (quadratically w.r.t output length)." + ) + + return parser \ No newline at end of file
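The memory knobs added here are forwarded to the C++ runner; --gpu_weights_percent in particular controls what fraction of the weights stays resident on the GPU, with the remainder streamed from host memory at runtime. A rough back-of-the-envelope helper, purely illustrative and not part of the runtime; the 13 GiB figure is an approximate size for fp16 Llama2-7B weights, not a number from this repository.

    def split_weight_residency(total_weights_gib, gpu_weights_percent=1.0):
        """Rough split implied by --gpu_weights_percent: GPU-resident vs host-streamed."""
        on_gpu = total_weights_gib * gpu_weights_percent
        return on_gpu, total_weights_gib - on_gpu

    print(split_weight_residency(13.0, 1.0))   # (13.0, 0.0) -> default, everything on GPU
    print(split_weight_residency(13.0, 0.5))   # (6.5, 6.5)  -> half streamed at runtime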